Moved becp to standard duplicated modules. Initial porting of becmod subroutines, but still a WIP! Other minor changes.

This commit is contained in:
Pietro Bonfa 2018-05-24 09:30:17 +02:00
parent 4f14b53cba
commit ffaf3612e4
24 changed files with 973 additions and 20 deletions

317
Modules/becmod_gpu.f90 Normal file
View File

@ -0,0 +1,317 @@
!
! Copyright (C) 2002-2011 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
#define DIMS2D(my_array) lbound(my_array,1):ubound(my_array,1),lbound(my_array,2):ubound(my_array,2)
!=----------------------------------------------------------------------------=!
MODULE becmod_gpum
!=----------------------------------------------------------------------------=!
USE kinds, ONLY : DP
IMPLICIT NONE
SAVE
!
TYPE bec_type_d
#if defined(__CUDA)
REAL(DP), ALLOCATABLE, DEVICE :: r_d(:, :)
#else
REAL(DP), ALLOCATABLE :: r_d(:, :)
#endif
#if defined(__CUDA)
COMPLEX(DP), ALLOCATABLE, DEVICE :: k_d(:, :)
#else
COMPLEX(DP), ALLOCATABLE :: k_d(:, :)
#endif
#if defined(__CUDA)
COMPLEX(DP), ALLOCATABLE, DEVICE :: nc_d(:, :, :)
#else
COMPLEX(DP), ALLOCATABLE :: nc_d(:, :, :)
#endif
INTEGER :: comm
INTEGER :: nbnd
INTEGER :: nproc
INTEGER :: mype
INTEGER :: nbnd_loc
INTEGER :: ibnd_begin
END TYPE bec_type_d
!
TYPE (bec_type_d), TARGET :: becp_d ! <beta|psi>
!
#if defined(__CUDA)
LOGICAL :: becp_r_ood = .false. ! used to flag out of date variables
LOGICAL :: becp_d_r_d_ood = .false. ! used to flag out of date variables
LOGICAL :: becp_k_ood = .false. ! used to flag out of date variables
LOGICAL :: becp_d_k_d_ood = .false. ! used to flag out of date variables
LOGICAL :: becp_nc_ood = .false. ! used to flag out of date variables
LOGICAL :: becp_d_nc_d_ood = .false. ! used to flag out of date variables
!
#endif
CONTAINS
!
SUBROUTINE using_becp_r(intento)
!
! intento is used to specify what the variable will be used for :
! 0 -> in , the variable needs to be synchronized but won't be changed
! 1 -> inout , the variable needs to be synchronized AND will be changed
! 2 -> out , NO NEED to synchronize the variable, everything will be overwritten
!
USE becmod, ONLY : becp
implicit none
INTEGER, INTENT(IN) :: intento
INTEGER :: intento_
intento_ = intento
#if defined(__CUDA)
!
IF (becp_r_ood) THEN
IF (.not. allocated(becp_d%r_d)) THEN
CALL errore('using_r_d', 'PANIC: sync of becp%r from becp_d%r_d with unallocated array. Bye!!', 1)
stop
END IF
IF (.not. allocated(becp%r)) THEN
IF (intento_ /= 2) THEN
print *, "WARNING: sync of becp%r with unallocated array and intento /= 2? Changed to 2!"
intento_ = 2
END IF
END IF
IF (intento_ < 2) THEN
print *, "Really copied becp%r D->H"
becp%r = becp_d%r_d
! update auxiliary variables
becp%comm = becp_d%comm
becp%nbnd = becp_d%nbnd
becp%nproc = becp_d%nproc
becp%mype = becp_d%mype
becp%nbnd_loc = becp_d%nbnd_loc
becp%ibnd_begin = becp_d%ibnd_begin
!
END IF
becp_r_ood = .false.
ENDIF
IF (intento_ > 0) becp_d_r_d_ood = .true.
#endif
END SUBROUTINE using_becp_r
!
SUBROUTINE using_becp_r_d(intento)
!
USE becmod, ONLY : becp
implicit none
INTEGER, INTENT(IN) :: intento
#if defined(__CUDA)
!
IF (.not. allocated(becp%r)) THEN
IF (intento /= 2) print *, "WARNING: sync of becp%r_d with unallocated array and intento /= 2?"
IF (allocated(becp_d%r_d)) DEALLOCATE(becp_d%r_d)
becp_d_r_d_ood = .false.
RETURN
END IF
! here we know that r is allocated, check if size is 0
IF ( SIZE(becp%r) == 0 ) THEN
print *, "Refusing to allocate 0 dimensional array becp_d%r_d. If used, code will crash."
RETURN
END IF
!
IF (becp_d_r_d_ood) THEN
IF ( allocated(becp_d%r_d) .and. (SIZE(becp_d%r_d)/=SIZE(becp%r))) deallocate(becp_d%r_d)
IF (.not. allocated(becp_d%r_d)) ALLOCATE(becp_d%r_d, MOLD=becp%r) ! this copy may be avoided
IF (intento < 2) THEN
print *, "Really copied becp%r H->D"
becp_d%r_d = becp%r
! update auxiliary variables
becp_d%comm = becp%comm
becp_d%nbnd = becp%nbnd
becp_d%nproc = becp%nproc
becp_d%mype = becp%mype
becp_d%nbnd_loc = becp%nbnd_loc
becp_d%ibnd_begin = becp%ibnd_begin
!
END IF
becp_d_r_d_ood = .false.
ENDIF
IF (intento > 0) becp_r_ood = .true.
#else
CALL errore('using_becp_d%r_d', 'Trying to use device data without device compilated code!', 1)
#endif
END SUBROUTINE using_becp_r_d
!
SUBROUTINE using_becp_k(intento)
!
! intento is used to specify what the variable will be used for :
! 0 -> in , the variable needs to be synchronized but won't be changed
! 1 -> inout , the variable needs to be synchronized AND will be changed
! 2 -> out , NO NEED to synchronize the variable, everything will be overwritten
!
USE becmod, ONLY : becp
implicit none
INTEGER, INTENT(IN) :: intento
INTEGER :: intento_
intento_ = intento
#if defined(__CUDA)
!
IF (becp_k_ood) THEN
IF (.not. allocated(becp_d%k_d)) THEN
CALL errore('using_k_d', 'PANIC: sync of becp%k from becp_d%k_d with unallocated array. Bye!!', 1)
stop
END IF
IF (.not. allocated(becp%k)) THEN
IF (intento_ /= 2) THEN
print *, "WARNING: sync of becp%k with unallocated array and intento /= 2? Changed to 2!"
intento_ = 2
END IF
END IF
IF (intento_ < 2) THEN
print *, "Really copied becp%k D->H"
becp%k = becp_d%k_d
! update auxiliary variables
becp%comm = becp_d%comm
becp%nbnd = becp_d%nbnd
becp%nproc = becp_d%nproc
becp%mype = becp_d%mype
becp%nbnd_loc = becp_d%nbnd_loc
becp%ibnd_begin = becp_d%ibnd_begin
!
END IF
becp_k_ood = .false.
ENDIF
IF (intento_ > 0) becp_d_k_d_ood = .true.
#endif
END SUBROUTINE using_becp_k
!
SUBROUTINE using_becp_k_d(intento)
!
USE becmod, ONLY : becp
implicit none
INTEGER, INTENT(IN) :: intento
#if defined(__CUDA)
!
IF (.not. allocated(becp%k)) THEN
IF (intento /= 2) print *, "WARNING: sync of becp%k_d with unallocated array and intento /= 2?"
IF (allocated(becp_d%k_d)) DEALLOCATE(becp_d%k_d)
becp_d_k_d_ood = .false.
RETURN
END IF
! here we know that k is allocated, check if size is 0
IF ( SIZE(becp%k) == 0 ) THEN
print *, "Refusing to allocate 0 dimensional array becp_d%k_d. If used, code will crash."
RETURN
END IF
!
IF (becp_d_k_d_ood) THEN
IF ( allocated(becp_d%k_d) .and. (SIZE(becp_d%k_d)/=SIZE(becp%k))) deallocate(becp_d%k_d)
IF (.not. allocated(becp_d%k_d)) ALLOCATE(becp_d%k_d, MOLD=becp%k) ! this copy may be avoided
IF (intento < 2) THEN
print *, "Really copied becp%k H->D"
becp_d%k_d = becp%k
! update auxiliary variables
becp_d%comm = becp%comm
becp_d%nbnd = becp%nbnd
becp_d%nproc = becp%nproc
becp_d%mype = becp%mype
becp_d%nbnd_loc = becp%nbnd_loc
becp_d%ibnd_begin = becp%ibnd_begin
!
END IF
becp_d_k_d_ood = .false.
ENDIF
IF (intento > 0) becp_k_ood = .true.
#else
CALL errore('using_becp_d%k_d', 'Trying to use device data without device compilated code!', 1)
#endif
END SUBROUTINE using_becp_k_d
!
SUBROUTINE using_becp_nc(intento)
!
! intento is used to specify what the variable will be used for :
! 0 -> in , the variable needs to be synchronized but won't be changed
! 1 -> inout , the variable needs to be synchronized AND will be changed
! 2 -> out , NO NEED to synchronize the variable, everything will be overwritten
!
USE becmod, ONLY : becp
implicit none
INTEGER, INTENT(IN) :: intento
INTEGER :: intento_
intento_ = intento
#if defined(__CUDA)
!
IF (becp_nc_ood) THEN
IF (.not. allocated(becp_d%nc_d)) THEN
CALL errore('using_nc_d', 'PANIC: sync of becp%nc from becp_d%nc_d with unallocated array. Bye!!', 1)
stop
END IF
IF (.not. allocated(becp%nc)) THEN
IF (intento_ /= 2) THEN
print *, "WARNING: sync of becp%nc with unallocated array and intento /= 2? Changed to 2!"
intento_ = 2
END IF
END IF
IF (intento_ < 2) THEN
print *, "Really copied becp%nc D->H"
becp%nc = becp_d%nc_d
! update auxiliary variables
becp%comm = becp_d%comm
becp%nbnd = becp_d%nbnd
becp%nproc = becp_d%nproc
becp%mype = becp_d%mype
becp%nbnd_loc = becp_d%nbnd_loc
becp%ibnd_begin = becp_d%ibnd_begin
!
END IF
becp_nc_ood = .false.
ENDIF
IF (intento_ > 0) becp_d_nc_d_ood = .true.
#endif
END SUBROUTINE using_becp_nc
!
SUBROUTINE using_becp_nc_d(intento)
!
USE becmod, ONLY : becp
implicit none
INTEGER, INTENT(IN) :: intento
#if defined(__CUDA)
!
IF (.not. allocated(becp%nc)) THEN
IF (intento /= 2) print *, "WARNING: sync of becp%nc_d with unallocated array and intento /= 2?"
IF (allocated(becp_d%nc_d)) DEALLOCATE(becp_d%nc_d)
becp_d_nc_d_ood = .false.
RETURN
END IF
! here we know that nc is allocated, check if size is 0
IF ( SIZE(becp%nc) == 0 ) THEN
print *, "Refusing to allocate 0 dimensional array becp_d%nc_d. If used, code will crash."
RETURN
END IF
!
IF (becp_d_nc_d_ood) THEN
IF ( allocated(becp_d%nc_d) .and. (SIZE(becp_d%nc_d)/=SIZE(becp%nc))) deallocate(becp_d%nc_d)
IF (.not. allocated(becp_d%nc_d)) ALLOCATE(becp_d%nc_d, MOLD=becp%nc) ! this copy may be avoided
IF (intento < 2) THEN
print *, "Really copied becp%nc H->D"
becp_d%nc_d = becp%nc
! update auxiliary variables
becp_d%comm = becp%comm
becp_d%nbnd = becp%nbnd
becp_d%nproc = becp%nproc
becp_d%mype = becp%mype
becp_d%nbnd_loc = becp%nbnd_loc
becp_d%ibnd_begin = becp%ibnd_begin
!
END IF
becp_d_nc_d_ood = .false.
ENDIF
IF (intento > 0) becp_nc_ood = .true.
#else
CALL errore('using_becp_d%nc_d', 'Trying to use device data without device compilated code!', 1)
#endif
END SUBROUTINE using_becp_nc_d
!
SUBROUTINE deallocate_becmod_gpu
IF( ALLOCATED( becp_d%r_d ) ) DEALLOCATE( becp_d%r_d )
IF( ALLOCATED( becp_d%k_d ) ) DEALLOCATE( becp_d%k_d )
IF( ALLOCATED( becp_d%nc_d ) ) DEALLOCATE( becp_d%nc_d )
END SUBROUTINE deallocate_becmod_gpu
!=----------------------------------------------------------------------------=!
END MODULE becmod_gpum
!=----------------------------------------------------------------------------=!

517
Modules/becmod_subs_gpu.f90 Normal file
View File

@ -0,0 +1,517 @@
!
! Copyright (C) 2001-2007 PWSCF group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!----------------------------------------------------------------------------
!
MODULE becmod_subs_gpum
! NOTA BENE : THE SUBROUTINES IN THIS FILE HAVE NOT BEEN TESTED YET!!!!!!!
!
! ... *bec* contain <beta|psi> - used in h_psi, s_psi, many other places
! ... calbec( npw, beta, psi, betapsi [, nbnd ] ) is an interface calculating
! ... betapsi(i,j) = <beta(i)|psi(j)> (the sum is over npw components)
! ... or betapsi(i,s,j)= <beta(i)|psi(s,j)> (s=polarization index)
!
USE kinds, ONLY : DP
USE control_flags, ONLY : gamma_only, smallmem
USE gvect, ONLY : gstart
USE noncollin_module, ONLY : noncolin, npol
USE becmod_gpum, ONLY : bec_type_d
!
SAVE
!
PRIVATE
!
INTERFACE calbec_gpu
!
MODULE PROCEDURE calbec_k_gpu, calbec_gamma_gpu, calbec_gamma_nocomm_gpu, calbec_nc_gpu, calbec_bec_type_gpu
!
END INTERFACE
INTERFACE becscal_gpu
!
MODULE PROCEDURE becscal_nck_gpu, becscal_gamma_gpu
!
END INTERFACE
!
PUBLIC :: allocate_bec_type_gpu, deallocate_bec_type_gpu, calbec_gpu, &
beccopy_gpu, becscal_gpu, is_allocated_bec_type_gpu, &
using_becp_auto, using_becp_d_auto
!
CONTAINS
!-----------------------------------------------------------------------
SUBROUTINE calbec_bec_type_gpu ( npw, beta_d, psi_d, betapsi_d, nbnd )
!-----------------------------------------------------------------------
!_
USE mp_bands, ONLY: intra_bgrp_comm
USE mp, ONLY: mp_get_comm_null
!
IMPLICIT NONE
COMPLEX (DP), INTENT (in) :: beta_d(:,:), psi_d(:,:)
TYPE (bec_type_d), TARGET, INTENT (inout) :: betapsi_d ! NB: must be INOUT otherwise
! the allocatd array is lost
INTEGER, INTENT (in) :: npw
INTEGER, OPTIONAL :: nbnd
#if defined(__CUDA)
attributes(DEVICE) :: beta_d, psi_d
#endif
!
INTEGER :: local_nbnd
INTEGER, EXTERNAL :: ldim_block, gind_block
INTEGER :: m_loc, m_begin, ip
REAL(DP), ALLOCATABLE :: dtmp_d(:,:) ! replace this with buffers !
INTEGER :: i, j, npwx
REAL(DP), POINTER :: betapsi_d_r_d(:,:)
#if defined(__CUDA)
attributes(DEVICE) :: dtmp_d, betapsi_d_r_d
#endif
!
IF ( present (nbnd) ) THEN
local_nbnd = nbnd
ELSE
local_nbnd = size ( psi_d, 2)
ENDIF
IF ( gamma_only ) THEN
!
IF( betapsi_d%comm == mp_get_comm_null() ) THEN
!
CALL calbec_gamma_gpu ( npw, beta_d, psi_d, betapsi_d%r_d, local_nbnd, intra_bgrp_comm )
!
ELSE
!
ALLOCATE( dtmp_d( SIZE( betapsi_d%r_d, 1 ), SIZE( betapsi_d%r_d, 2 ) ) )
!
DO ip = 0, betapsi_d%nproc - 1
m_loc = ldim_block( betapsi_d%nbnd , betapsi_d%nproc, ip )
m_begin = gind_block( 1, betapsi_d%nbnd, betapsi_d%nproc, ip )
IF( ( m_begin + m_loc - 1 ) > local_nbnd ) m_loc = local_nbnd - m_begin + 1
IF( m_loc > 0 ) THEN
CALL calbec_gamma_gpu ( npw, beta_d, psi_d(:,m_begin:m_begin+m_loc-1), dtmp_d, m_loc, betapsi_d%comm )
IF( ip == betapsi_d%mype ) THEN
npwx = SIZE( betapsi_d%r_d, 1 )
betapsi_d_r_d => betapsi_d%r_d
!$cuf kernel do(2) <<<*,*>>>
DO j=1,m_loc
DO i=1, npwx
betapsi_d_r_d(i,j) = dtmp_d(i,j)
END DO
END DO
END IF
END IF
END DO
DEALLOCATE( dtmp_d )
!
END IF
!
ELSEIF ( noncolin) THEN
!
CALL calbec_nc_gpu ( npw, beta_d, psi_d, betapsi_d%nc_d, local_nbnd )
!
ELSE
!
CALL calbec_k_gpu ( npw, beta_d, psi_d, betapsi_d%k_d, local_nbnd )
!
ENDIF
!
RETURN
!
END SUBROUTINE calbec_bec_type_gpu
!-----------------------------------------------------------------------
SUBROUTINE calbec_gamma_nocomm_gpu ( npw, beta_d, psi_d, betapsi_d, nbnd )
!-----------------------------------------------------------------------
USE mp_bands, ONLY: intra_bgrp_comm
IMPLICIT NONE
COMPLEX (DP), INTENT (in) :: beta_d(:,:), psi_d(:,:)
REAL (DP), INTENT (out) :: betapsi_d(:,:)
INTEGER, INTENT (in) :: npw
INTEGER, OPTIONAL :: nbnd
INTEGER :: m
#if defined(__CUDA)
attributes(DEVICE) :: beta_d, psi_d, betapsi_d
#endif
IF ( present (nbnd) ) THEN
m = nbnd
ELSE
m = size ( psi_d, 2)
ENDIF
CALL calbec_gamma_gpu ( npw, beta_d, psi_d, betapsi_d, m, intra_bgrp_comm )
RETURN
!
END SUBROUTINE calbec_gamma_nocomm_gpu
!-----------------------------------------------------------------------
SUBROUTINE calbec_gamma_gpu ( npw, beta_d, psi_d, betapsi_d, nbnd, comm )
!-----------------------------------------------------------------------
!
! ... matrix times matrix with summation index (k=1,npw) running on
! ... half of the G-vectors or PWs - assuming k=0 is the G=0 component:
! ... betapsi(i,j) = 2Re(\sum_k beta^*(i,k)psi(k,j)) + beta^*(i,0)psi(0,j)
!
USE mp, ONLY : mp_sum
#if defined(__CUDA)
USE cudafor
USE cublas
#endif
IMPLICIT NONE
COMPLEX (DP), INTENT (in) :: beta_d(:,:), psi_d(:,:)
REAL (DP), INTENT (out) :: betapsi_d(:,:)
INTEGER, INTENT (in) :: npw
INTEGER, INTENT (in) :: nbnd
INTEGER, INTENT (in) :: comm
!
#if defined(__CUDA)
attributes(DEVICE) :: beta_d, psi_d, betapsi_d
#endif
INTEGER :: nkb, npwx, m
INTEGER :: i,j
!
m = nbnd
!
nkb = size (beta_d, 2)
IF ( nkb == 0 ) RETURN
!
CALL start_clock( 'calbec' )
IF ( npw == 0 ) betapsi_d(:,:)=0.0_DP
npwx= size (beta_d, 1)
IF ( npwx /= size (psi_d, 1) ) CALL errore ('calbec', 'size mismatch', 1)
IF ( npwx < npw ) CALL errore ('calbec', 'size mismatch', 2)
#if defined(DEBUG)
WRITE (*,*) 'calbec gamma'
WRITE (*,*) nkb, size (betapsi_d,1) , m , size (betapsi_d, 2)
#endif
IF ( nkb /= size (betapsi_d,1) .or. m > size (betapsi_d, 2) ) &
CALL errore ('calbec', 'size mismatch', 3)
!
IF ( m == 1 ) THEN
!
CALL cudaDGEMV( 'C', 2*npw, nkb, 2.0_DP, beta_d, 2*npwx, psi_d, 1, 0.0_DP, &
betapsi_d, 1 )
IF ( gstart == 2 ) THEN
!betapsi_d(:,1) = betapsi_d(:,1) - beta_d(1,:)*psi_d(1,1)
!$cuf kernel do(1) <<<*,*>>>
DO i=1, nkb
betapsi_d(i,1) = betapsi_d(i,1) - DBLE(beta_d(1,i)*psi_d(1,1))
END DO
END IF
!
ELSE
!
CALL DGEMM( 'C', 'N', nkb, m, 2*npw, 2.0_DP, beta_d, 2*npwx, psi_d, &
2*npwx, 0.0_DP, betapsi_d, nkb )
IF ( gstart == 2 ) &
CALL cudaDGER( nkb, m, -1.0_DP, beta_d, 2*npwx, psi_d, 2*npwx, betapsi_d, nkb )
!
ENDIF
!
CALL mp_sum( betapsi_d( :, 1:m ), comm )
!
CALL stop_clock( 'calbec' )
!
RETURN
!
END SUBROUTINE calbec_gamma_gpu
!
!-----------------------------------------------------------------------
SUBROUTINE calbec_k_gpu ( npw, beta_d, psi_d, betapsi_d, nbnd )
!-----------------------------------------------------------------------
!
! ... matrix times matrix with summation index (k=1,npw) running on
! ... G-vectors or PWs : betapsi(i,j) = \sum_k beta^*(i,k) psi(k,j)
!
USE mp_bands, ONLY : intra_bgrp_comm
USE mp, ONLY : mp_sum
#if defined(__CUDA)
USE cudafor
USE cublas
#endif
IMPLICIT NONE
COMPLEX (DP), INTENT (in) :: beta_d(:,:), psi_d(:,:)
COMPLEX (DP), INTENT (out) :: betapsi_d(:,:)
INTEGER, INTENT (in) :: npw
INTEGER, OPTIONAL :: nbnd
!
INTEGER :: nkb, npwx, m
!
#if defined(__CUDA)
attributes(device) :: beta_d, psi_d, betapsi_d
#endif
nkb = size (beta_d, 2)
IF ( nkb == 0 ) RETURN
!
CALL start_clock( 'calbec' )
IF ( npw == 0 ) betapsi_d(:,:)=(0.0_DP,0.0_DP)
npwx= size (beta_d, 1)
IF ( npwx /= size (psi_d, 1) ) CALL errore ('calbec', 'size mismatch', 1)
IF ( npwx < npw ) CALL errore ('calbec', 'size mismatch', 2)
IF ( present (nbnd) ) THEN
m = nbnd
ELSE
m = size ( psi_d, 2)
ENDIF
#if defined(DEBUG)
WRITE (*,*) 'calbec k'
WRITE (*,*) nkb, size (betapsi_d,1) , m , size (betapsi_d, 2)
#endif
IF ( nkb /= size (betapsi_d,1) .or. m > size (betapsi_d, 2) ) &
CALL errore ('calbec', 'size mismatch', 3)
!
IF ( m == 1 ) THEN
!
CALL ZGEMV( 'C', npw, nkb, (1.0_DP,0.0_DP), beta_d, npwx, psi_d, 1, &
(0.0_DP, 0.0_DP), betapsi_d, 1 )
!
ELSE
!
CALL ZGEMM( 'C', 'N', nkb, m, npw, (1.0_DP,0.0_DP), &
beta_d, npwx, psi_d, npwx, (0.0_DP,0.0_DP), betapsi_d, nkb )
!
ENDIF
!
CALL mp_sum( betapsi_d( :, 1:m ), intra_bgrp_comm )
!
CALL stop_clock( 'calbec' )
!
RETURN
!
END SUBROUTINE calbec_k_gpu
!
!-----------------------------------------------------------------------
SUBROUTINE calbec_nc_gpu ( npw, beta_d, psi_d, betapsi_d, nbnd )
!-----------------------------------------------------------------------
!
! ... matrix times matrix with summation index (k below) running on
! ... G-vectors or PWs corresponding to two different polarizations:
! ... betapsi(i,1,j) = \sum_k=1,npw beta^*(i,k) psi(k,j)
! ... betapsi(i,2,j) = \sum_k=1,npw beta^*(i,k) psi(k+npwx,j)
!
USE mp_bands, ONLY : intra_bgrp_comm
USE mp, ONLY : mp_sum
#if defined(__CUDA)
USE cudafor
USE cublas
#endif
IMPLICIT NONE
COMPLEX (DP), INTENT (in) :: beta_d(:,:), psi_d(:,:)
COMPLEX (DP), INTENT (out) :: betapsi_d(:,:,:)
INTEGER, INTENT (in) :: npw
INTEGER, OPTIONAL :: nbnd
!
INTEGER :: nkb, npwx, npol, m
!
#if defined(__CUDA)
attributes(device) :: beta_d, psi_d, betapsi_d
#endif
nkb = size (beta_d, 2)
IF ( nkb == 0 ) RETURN
!
CALL start_clock ('calbec')
IF ( npw == 0 ) betapsi_d(:,:,:)=(0.0_DP,0.0_DP)
npwx= size (beta_d, 1)
IF ( 2*npwx /= size (psi_d, 1) ) CALL errore ('calbec', 'size mismatch', 1)
IF ( npwx < npw ) CALL errore ('calbec', 'size mismatch', 2)
IF ( present (nbnd) ) THEN
m = nbnd
ELSE
m = size ( psi_d, 2)
ENDIF
npol= size (betapsi_d, 2)
#if defined(DEBUG)
WRITE (*,*) 'calbec nc'
WRITE (*,*) nkb, size (betapsi_d,1) , m , size (betapsi_d, 3)
#endif
IF ( nkb /= size (betapsi_d,1) .or. m > size (betapsi_d, 3) ) &
CALL errore ('calbec', 'size mismatch', 3)
!
CALL ZGEMM ('C', 'N', nkb, m*npol, npw, (1.0_DP, 0.0_DP), beta_d, &
npwx, psi_d, npwx, (0.0_DP, 0.0_DP), betapsi_d, nkb)
!
CALL mp_sum( betapsi_d( :, :, 1:m ), intra_bgrp_comm )
!
CALL stop_clock( 'calbec' )
!
RETURN
!
END SUBROUTINE calbec_nc_gpu
!
!
!-----------------------------------------------------------------------
FUNCTION is_allocated_bec_type_gpu (bec_d) RESULT (isalloc)
!-----------------------------------------------------------------------
IMPLICIT NONE
TYPE (bec_type_d) :: bec_d
LOGICAL :: isalloc
isalloc = (allocated(bec_d%r_d) .or. allocated(bec_d%nc_d) .or. allocated(bec_d%k_d))
RETURN
!
!-----------------------------------------------------------------------
END FUNCTION is_allocated_bec_type_gpu
!-----------------------------------------------------------------------
!
!-----------------------------------------------------------------------
SUBROUTINE allocate_bec_type_gpu ( )
!-----------------------------------------------------------------------
USE becmod_gpum, ONLY : using_becp_r, using_becp_r_d
USE becmod_gpum, ONLY : using_becp_k, using_becp_k_d
USE becmod_gpum, ONLY : using_becp_nc, using_becp_nc_d
IMPLICIT NONE
!
IF ( gamma_only ) THEN
!
CALL using_becp_r(2); CALL using_becp_r_d(0)
!
ELSEIF ( noncolin) THEN
!
CALL using_becp_nc(2); CALL using_becp_nc_d(0)
!
ELSE
!
CALL using_becp_k(2); CALL using_becp_k_d(0)
!
ENDIF
!
RETURN
!
END SUBROUTINE allocate_bec_type_gpu
!
!-----------------------------------------------------------------------
SUBROUTINE deallocate_bec_type_gpu ( )
!-----------------------------------------------------------------------
!
USE becmod_gpum, ONLY : using_becp_r, using_becp_r_d
USE becmod_gpum, ONLY : using_becp_k, using_becp_k_d
USE becmod_gpum, ONLY : using_becp_nc, using_becp_nc_d
IMPLICIT NONE
!
!
IF ( gamma_only ) THEN
!
CALL using_becp_r(2); CALL using_becp_r_d(0)
!
ELSEIF ( noncolin) THEN
!
CALL using_becp_nc(2); CALL using_becp_nc_d(0)
!
ELSE
!
CALL using_becp_k(2); CALL using_becp_k_d(0)
!
ENDIF
!
RETURN
!
END SUBROUTINE deallocate_bec_type_gpu
SUBROUTINE beccopy_gpu(bec, bec1, nkb, nbnd)
#if defined(__CUDA)
USE cudafor
USE cublas
#endif
IMPLICIT NONE
TYPE(bec_type_d), INTENT(in) :: bec
TYPE(bec_type_d) :: bec1
INTEGER, INTENT(in) :: nkb, nbnd
IF (gamma_only) THEN
CALL dcopy(nkb*nbnd, bec%r_d, 1, bec1%r_d, 1)
ELSEIF (noncolin) THEN
CALL zcopy(nkb*npol*nbnd, bec%nc_d, 1, bec1%nc_d, 1)
ELSE
CALL zcopy(nkb*nbnd, bec%k_d, 1, bec1%k_d, 1)
ENDIF
RETURN
END SUBROUTINE beccopy_gpu
SUBROUTINE becscal_nck_gpu(alpha, bec_d, nkb, nbnd)
#if defined(__CUDA)
USE cudafor
USE cublas
#endif
IMPLICIT NONE
TYPE(bec_type_d), INTENT(INOUT) :: bec_d
COMPLEX(DP), INTENT(IN) :: alpha
INTEGER, INTENT(IN) :: nkb, nbnd
IF (gamma_only) THEN
CALL errore('becscal_nck','called in the wrong case',1)
ELSEIF (noncolin) THEN
CALL zscal(nkb*npol*nbnd, alpha, bec_d%nc_d, 1)
ELSE
CALL zscal(nkb*nbnd, alpha, bec_d%k_d, 1)
ENDIF
RETURN
END SUBROUTINE becscal_nck_gpu
SUBROUTINE becscal_gamma_gpu(alpha, bec_d, nkb, nbnd)
#if defined(__CUDA)
USE cudafor
USE cublas
#endif
IMPLICIT NONE
TYPE(bec_type_d), INTENT(INOUT) :: bec_d
REAL(DP), INTENT(IN) :: alpha
INTEGER, INTENT(IN) :: nkb, nbnd
IF (gamma_only) THEN
CALL dscal(nkb*nbnd, alpha, bec_d%r_d, 1)
ELSE
CALL errore('becscal_gamma','called in the wrong case',1)
ENDIF
RETURN
END SUBROUTINE becscal_gamma_gpu
!
SUBROUTINE using_becp_auto(intento)
USE becmod_gpum, ONLY : using_becp_r
USE becmod_gpum, ONLY : using_becp_k
USE becmod_gpum, ONLY : using_becp_nc
IMPLICIT NONE
INTEGER, INTENT(IN) :: intento
!
!
IF ( gamma_only ) THEN
!
CALL using_becp_r(intento)
!
ELSEIF ( noncolin) THEN
!
CALL using_becp_nc(intento)
!
ELSE
!
CALL using_becp_k(intento)
!
ENDIF
END SUBROUTINE using_becp_auto
!
SUBROUTINE using_becp_d_auto(intento)
USE becmod_gpum, ONLY : using_becp_r_d
USE becmod_gpum, ONLY : using_becp_k_d
USE becmod_gpum, ONLY : using_becp_nc_d
IMPLICIT NONE
INTEGER, INTENT(IN) :: intento
!
!
IF ( gamma_only ) THEN
!
CALL using_becp_r_d(intento)
!
ELSEIF ( noncolin) THEN
!
CALL using_becp_nc_d(intento)
!
ELSE
!
CALL using_becp_k_d(intento)
!
ENDIF
END SUBROUTINE using_becp_d_auto
END MODULE becmod_subs_gpum

View File

@ -0,0 +1,39 @@
!----------------------------------------------
! ... this file contains a number of subroutines optionally interfaced
! ... to cublas
!----------------------------------------------
SUBROUTINE cudaDGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
#if defined(__CUDA)
use cudafor
use cublas
#endif
implicit none
DOUBLE PRECISION :: ALPHA,BETA
INTEGER :: INCX,INCY,LDA,M,N
CHARACTER :: TRANS
DOUBLE PRECISION :: A(LDA,*),X(*),Y(*)
#if defined(__CUDA)
attributes(device) :: A, X, Y
#endif
!
call DGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
!
END SUBROUTINE cudaDGEMV
SUBROUTINE cudaDGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
#if defined(__CUDA)
use cudafor
use cublas
#endif
! .. Scalar Arguments ..
DOUBLE PRECISION :: ALPHA
INTEGER :: INCX, INCY, LDA, M, N
! .. Array Arguments ..
DOUBLE PRECISION :: A( LDA, * ), X( * ), Y( * )
#if defined(__CUDA)
attributes(device) :: A, X, Y
#endif
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
END SUBROUTINE cudaDGER

View File

@ -3,7 +3,12 @@ MODULES += \
qe_buffers.o \
wavefunctions_gpu.o \
recvec_gpu.o \
uspp_gpu.o
uspp_gpu.o \
becmod_gpu.o \
becmod_subs_gpu.o \
cuda_subroutines.o
recvec_gpu.o : recvec.o
uspp_gpu.o : uspp.o
becmod_gpu.o : becmod.o
becmod_subs_gpu.o : becmod_gpu.o

View File

@ -31,6 +31,7 @@ SUBROUTINE add_vuspsi( lda, n, m, hpsi )
USE becmod, ONLY: bec_type, becp
!
USE uspp_gpum, ONLY : using_vkb, using_indv_ijkb0, using_deeq, using_deeq_nc
USE becmod_subs_gpum, ONLY : using_becp_auto
!
IMPLICIT NONE
!
@ -46,6 +47,7 @@ SUBROUTINE add_vuspsi( lda, n, m, hpsi )
!
!
CALL start_clock( 'add_vuspsi' )
CALL using_becp_auto(0)
!
IF ( gamma_only ) THEN
!

View File

@ -193,6 +193,7 @@ SUBROUTINE diag_bands( iter, ik, avg_iter )
USE wvfct_gpum, ONLY : et_d, using_et, using_et_d, &
g2kin_d, using_g2kin, using_g2kin_d
USE uspp_gpum, ONLY : using_vkb
USE becmod_subs_gpum, ONLY : using_becp_auto
!
IMPLICIT NONE
!
@ -244,6 +245,7 @@ SUBROUTINE diag_bands( iter, ik, avg_iter )
! ... allocate space for <beta_i|psi_j> - used in h_psi and s_psi
!
CALL allocate_bec_type ( nkb, nbnd, becp, intra_bgrp_comm )
CALL using_becp_auto(2)
!
npw = ngk(ik)
IF ( gamma_only ) THEN
@ -259,6 +261,7 @@ SUBROUTINE diag_bands( iter, ik, avg_iter )
! ... deallocate work space
!
CALL deallocate_bec_type ( becp )
CALL using_becp_auto(2)
DEALLOCATE( s_diag )
DEALLOCATE( h_diag )
call using_h_diag(2); call using_s_diag(2)

View File

@ -35,6 +35,7 @@ SUBROUTINE compute_becsum ( iflag )
!
USE wavefunctions_module_gpum, ONLY : using_evc
USE uspp_gpum, ONLY : using_vkb
USE becmod_subs_gpum, ONLY : using_becp_auto
!
IMPLICIT NONE
!
@ -55,6 +56,7 @@ SUBROUTINE compute_becsum ( iflag )
!
becsum(:,:,:) = 0.D0
CALL allocate_bec_type (nkb,nbnd, becp,intra_bgrp_comm)
CALL using_becp_auto(2)
call divide (inter_bgrp_comm, nbnd, ibnd_start, ibnd_end )
this_bgrp_nbnd = ibnd_end - ibnd_start + 1
!
@ -93,6 +95,7 @@ SUBROUTINE compute_becsum ( iflag )
ENDIF
!
CALL deallocate_bec_type ( becp )
CALL using_becp_auto(2)
!
CALL stop_clock( 'compute_becsum' )
!

View File

@ -42,6 +42,7 @@ SUBROUTINE force_hub(forceh)
USE wavefunctions_module_gpum, ONLY : using_evc
USE uspp_gpum, ONLY : using_vkb, using_indv_ijkb0
USE becmod_subs_gpum, ONLY : using_becp_auto
IMPLICIT NONE
REAL (DP) :: forceh(3,nat) ! output: the Hubbard forces
@ -66,6 +67,8 @@ SUBROUTINE force_hub(forceh)
call allocate_bec_type ( nkb, nbnd, becp)
call allocate_bec_type ( nwfcU, nbnd, proj )
!
CALL using_becp_auto(2)
!
! poor-man parallelization over bands
! - if nproc_pool=1 : nb_s=1, nb_e=nbnd, mykey=0
! - if nproc_pool<=nbnd:each processor calculates band nb_s to nb_e; mykey=0
@ -90,7 +93,7 @@ SUBROUTINE force_hub(forceh)
CALL get_buffer (evc, nwordwfc, iunwfc, ik)
IF (nks > 1) CALL using_evc(1)
CALL using_vkb(1)
CALL using_vkb(1); CALL using_becp_auto(2)
CALL init_us_2 (npw,igk_k(1,ik),xk(1,ik),vkb)
CALL calbec( npw, vkb, evc, becp )
CALL s_psi (npwx, npw, nbnd, evc, spsi )
@ -145,6 +148,7 @@ SUBROUTINE force_hub(forceh)
DEALLOCATE( wfcatom )
DEALLOCATE( spsi )
DEALLOCATE( dns )
CALL using_becp_auto(2)
IF (nspin == 1) forceh(:,:) = 2.d0 * forceh(:,:)
!

View File

@ -38,6 +38,7 @@ SUBROUTINE force_us( forcenl )
USE wvfct_gpum, ONLY : using_et
USE uspp_gpum, ONLY : using_vkb, using_indv_ijkb0, using_qq_at, &
using_deeq
USE becmod_subs_gpum, ONLY : using_becp_auto
!
IMPLICIT NONE
!
@ -52,6 +53,7 @@ SUBROUTINE force_us( forcenl )
forcenl(:,:) = 0.D0
!
CALL allocate_bec_type ( nkb, nbnd, becp, intra_bgrp_comm )
CALL using_becp_auto(2)
CALL allocate_bec_type ( nkb, nbnd, dbecp, intra_bgrp_comm )
ALLOCATE( vkb1( npwx, nkb ) )
IF (noncolin) then
@ -77,7 +79,7 @@ SUBROUTINE force_us( forcenl )
CALL init_us_2( npw, igk_k(1,ik), xk(1,ik), vkb )
END IF
!
CALL using_vkb(0)
CALL using_vkb(0); CALL using_becp_auto(2)
CALL calbec ( npw, vkb, evc, becp )
!
DO ipol = 1, 3
@ -105,6 +107,7 @@ SUBROUTINE force_us( forcenl )
!
! ... if sums over bands are parallelized over the band group
!
CALL using_becp_auto(1)
IF( becp%comm /= mp_get_comm_null() ) CALL mp_sum( forcenl, becp%comm )
!
IF (noncolin) THEN
@ -115,6 +118,7 @@ SUBROUTINE force_us( forcenl )
DEALLOCATE( vkb1 )
CALL deallocate_bec_type ( dbecp )
CALL deallocate_bec_type ( becp )
CALL using_becp_auto(2)
!
! ... collect contributions across pools from all k-points
!

View File

@ -103,6 +103,7 @@ SUBROUTINE h_psi_( lda, n, m, psi, hpsi )
!
USE wvfct_gpum, ONLY : using_g2kin
USE scf_gpum, ONLY : using_vrs
USE becmod_subs_gpum, ONLY : using_becp_auto
!
IMPLICIT NONE
!
@ -126,6 +127,7 @@ SUBROUTINE h_psi_( lda, n, m, psi, hpsi )
IF ( gamma_only ) THEN
!
IF ( real_space .and. nkb > 0 ) then
CALL using_becp_auto(1)
!
! ... real-space algorithm
! ... fixme: real_space without beta functions does not make sense
@ -167,6 +169,7 @@ SUBROUTINE h_psi_( lda, n, m, psi, hpsi )
! ... real-space algorithm
! ... fixme: real_space without beta functions does not make sense
!
CALL using_becp_auto(1)
IF ( dffts%has_task_groups ) then
incr = fftx_ntgrp(dffts)
ELSE
@ -199,6 +202,8 @@ SUBROUTINE h_psi_( lda, n, m, psi, hpsi )
! ... (not in the real-space case: it is done together with V_loc)
!
IF ( nkb > 0 .AND. .NOT. real_space) THEN
!
CALL using_becp_auto(1)
!
CALL start_clock( 'h_psi:calbec' )
CALL calbec ( n, vkb, psi, becp, m )
@ -242,6 +247,7 @@ SUBROUTINE h_psi_( lda, n, m, psi, hpsi )
CALL vexxace_k(lda,m,psi,ee,hpsi)
END IF
ELSE
CALL using_becp_auto(0)
CALL vexx( lda, n, m, psi, hpsi, becp )
END IF
END IF

View File

@ -113,6 +113,7 @@ SUBROUTINE h_psi__gpu( lda, n, m, psi_d, hpsi_d )
!
USE wvfct_gpum, ONLY : g2kin_d, using_g2kin_d
USE uspp_gpum, ONLY : using_vkb
USE becmod_subs_gpum, ONLY : using_becp_auto
!
IMPLICIT NONE
!
@ -132,6 +133,7 @@ SUBROUTINE h_psi__gpu( lda, n, m, psi_d, hpsi_d )
CALL using_g2kin_d(0)
CALL using_vrs_d(0)
CALL using_vkb(0)
CALL using_becp_auto(0)
!
hpsi_d (:, 1:m) = (0.0_dp, 0.0_dp)

View File

@ -218,6 +218,7 @@ SUBROUTINE new_ns(ns)
USE uspp_param, ONLY : nhm, nh
!
USE uspp_gpum, ONLY : using_vkb, using_indv_ijkb0
USE becmod_subs_gpum, ONLY : using_becp_auto
!
IMPLICIT NONE
REAL(DP), INTENT(IN) :: q(nwfcU,nhm,nat)
@ -232,6 +233,7 @@ SUBROUTINE new_ns(ns)
! compute <beta|psi>
!
CALL allocate_bec_type (nkb, nbnd, becp)
CALL using_becp_auto(2)
CALL using_vkb(1)
CALL init_us_2 (npw,igk_k(1,ik),xk(1,ik),vkb)
CALL using_evc(0)
@ -243,6 +245,7 @@ SUBROUTINE new_ns(ns)
p%k(:,:) = (0.0_DP,0.0_DP)
ENDIF
!
CALL using_becp_auto(0)
DO nt = 1, ntyp
!
DO na = 1, nat
@ -279,6 +282,7 @@ SUBROUTINE new_ns(ns)
END DO
!
CALL deallocate_bec_type ( becp )
CALL using_becp_auto(2)
RETURN
END SUBROUTINE compute_pproj

View File

@ -55,6 +55,7 @@ SUBROUTINE orbm_kubo()
!
USE scf_gpum, ONLY : using_vrs
USE uspp_gpum, ONLY : using_vkb
USE becmod_subs_gpum, ONLY : using_becp_auto
! --- Avoid implicit definitions ---
IMPLICIT NONE
@ -123,6 +124,7 @@ SUBROUTINE orbm_kubo()
CALL using_vrs(1)
CALL set_vrs( vrs, vltot, v%of_r, kedtau, v%kin_r, dfftp%nnr, nspin, doublegrid )
CALL allocate_bec_type ( nkb, nbnd, becp )
CALL using_becp_auto(2)
! Initializations
! Define small number
@ -543,6 +545,7 @@ SUBROUTINE orbm_kubo()
! Deallocate arrays
CALL deallocate_bec_type ( becp )
CALL using_becp_auto(2)
DEALLOCATE(temp)
DEALLOCATE(evc_k)
DEALLOCATE(evc_kp)

View File

@ -30,7 +30,8 @@ SUBROUTINE orthoUwfc
USE control_flags, ONLY : gamma_only
USE noncollin_module, ONLY : noncolin, npol
!
USE uspp_gpum, ONLY : using_vkb
USE uspp_gpum, ONLY : using_vkb
USE becmod_subs_gpum, ONLY : using_becp_auto
!
IMPLICIT NONE
!
@ -80,6 +81,7 @@ SUBROUTINE orthoUwfc
! Allocate the array becp = <beta|wfcatom>
CALL allocate_bec_type (nkb,natomwfc, becp)
CALL using_becp_auto(2)
DO ik = 1, nks
@ -107,6 +109,7 @@ SUBROUTINE orthoUwfc
ENDDO
DEALLOCATE (wfcatom, swfcatom)
CALL deallocate_bec_type ( becp )
CALL using_becp_auto(2)
!
RETURN

View File

@ -332,6 +332,10 @@ CONTAINS
USE becmod, ONLY: becp, calbec, allocate_bec_type, deallocate_bec_type
USE exx, ONLY : exxenergy2, fock2
USE funct, ONLY : dft_is_hybrid
!
USE becmod_subs_gpum, ONLY : using_becp_auto
!
IMPLICIT NONE
COMPLEX(DP), ALLOCATABLE :: aux(:)
INTEGER :: npw, ibnd, j, ig, ik,ikk, ispin, na, nt, ijkb0, ikb,jkb, ih,jh
@ -340,6 +344,7 @@ CONTAINS
ALLOCATE (aux(dfftp%nnr))
CALL allocate_bec_type ( nkb, nbnd, becp )
CALL using_becp_auto(2)
ek = 0.d0
eloc= 0.d0
@ -479,6 +484,7 @@ CONTAINS
END IF
!
CALL deallocate_bec_type (becp)
CALL using_becp_auto(2)
DEALLOCATE (aux)
WRITE (stdout,*)

View File

@ -1592,6 +1592,8 @@ MODULE realus
USE mp_bands, ONLY : intra_bgrp_comm
USE mp, ONLY : mp_sum
!
USE becmod_gpum, ONLY : using_becp_k
!
IMPLICIT NONE
!
INTEGER, INTENT(in) :: ibnd, last
@ -1606,6 +1608,7 @@ MODULE realus
!
!
CALL start_clock( 'calbec_rs' )
CALL using_becp_k(1) ! intento=2?
!
IF( dffts%has_task_groups ) CALL errore( 'calbec_rs_k', 'task_groups not implemented', 1 )
@ -1673,6 +1676,7 @@ MODULE realus
USE fft_base, ONLY : dffts
!
USE uspp_gpum, ONLY : using_qq_at
USE becmod_gpum, ONLY : using_becp_r
!
IMPLICIT NONE
!
@ -1690,6 +1694,7 @@ MODULE realus
! Sync
CALL using_qq_at(0)
CALL using_becp_r(0)
!
fac = sqrt(omega)
!
@ -1760,6 +1765,7 @@ MODULE realus
USE fft_base, ONLY : dffts
!
USE uspp_gpum, ONLY : using_qq_at
USE becmod_gpum, ONLY : using_becp_k
!
IMPLICIT NONE
!
@ -1778,6 +1784,7 @@ MODULE realus
! Sync
CALL using_qq_at(0)
CALL using_becp_k(0)
call set_xkphase(current_k)
@ -1853,6 +1860,7 @@ MODULE realus
USE fft_base, ONLY : dffts
!
USE uspp_gpum, ONLY : using_deeq
USE becmod_gpum, ONLY : using_becp_r
!
IMPLICIT NONE
!
@ -1873,6 +1881,7 @@ MODULE realus
ELSE !non task groups part starts here
CALL using_deeq(0)
CALL using_becp_r(0)
!
fac = sqrt(omega)
!
@ -1958,6 +1967,7 @@ MODULE realus
USE fft_base, ONLY : dffts
!
USE uspp_gpum, ONLY : using_deeq
USE becmod_gpum, ONLY : using_becp_k
!
IMPLICIT NONE
!
@ -1975,6 +1985,7 @@ MODULE realus
IF( dffts%has_task_groups ) CALL errore( 'add_vuspsir_k', 'task_groups not implemented', 1 )
CALL using_deeq(0)
CALL using_becp_k(0)
call set_xkphase(current_k)
!

View File

@ -169,6 +169,7 @@ SUBROUTINE s_psi_( lda, n, m, psi, spsi )
! ... gamma version
!
USE mp, ONLY: mp_get_comm_null, mp_circular_shift_left
USE becmod_gpum, ONLY : using_becp_r
!
IMPLICIT NONE
!
@ -185,6 +186,7 @@ SUBROUTINE s_psi_( lda, n, m, psi, spsi )
!
CALL using_indv_ijkb0(0)
CALL using_qq_at(0)
CALL using_becp_r(0)
!
IF( becp%comm == mp_get_comm_null() ) THEN
nproc = 1
@ -282,6 +284,8 @@ SUBROUTINE s_psi_( lda, n, m, psi, spsi )
!
! ... k-points version
!
USE becmod_gpum, ONLY : using_becp_k
!
IMPLICIT NONE
!
! ... local variables
@ -293,6 +297,7 @@ SUBROUTINE s_psi_( lda, n, m, psi, spsi )
!
CALL using_indv_ijkb0(0)
CALL using_qq_at(0)
CALL using_becp_k(0)
!
ALLOCATE( ps( nkb, m ), STAT=ierr )
IF( ierr /= 0 ) &
@ -344,6 +349,8 @@ SUBROUTINE s_psi_( lda, n, m, psi, spsi )
!
! ... k-points noncolinear/spinorbit version
!
USE becmod_gpum, ONLY : using_becp_nc
!
IMPLICIT NONE
!
! here the local variables
@ -356,6 +363,7 @@ SUBROUTINE s_psi_( lda, n, m, psi, spsi )
CALL using_indv_ijkb0(0)
IF ( .NOT. lspinorb ) CALL using_qq_at(0)
IF (lspinorb) CALL using_qq_so(0)
CALL using_becp_nc(0)
!
ALLOCATE (ps(nkb,npol,m),STAT=ierr)
IF( ierr /= 0 ) &

View File

@ -104,6 +104,7 @@ SUBROUTINE s_psi__gpu( lda, n, m, psi_d, spsi_d )
invfft_orbital_k, fwfft_orbital_k, calbec_rs_k, s_psir_k
!
USE uspp_gpum, ONLY : vkb_d, using_vkb_d
USE becmod_gpum, ONLY : using_becp_r, using_becp_k, using_becp_nc
!
IMPLICIT NONE
!
@ -208,6 +209,7 @@ SUBROUTINE s_psi__gpu( lda, n, m, psi_d, spsi_d )
! the product vkb and psi
!
CALL using_vkb_d(0)
CALL using_becp_r(0)
!
IF( becp%comm == mp_get_comm_null() ) THEN
nproc = 1
@ -260,7 +262,7 @@ SUBROUTINE s_psi__gpu( lda, n, m, psi_d, spsi_d )
!
IF( becp%comm == mp_get_comm_null() ) THEN
IF ( m == 1 ) THEN
CALL myDGEMV( 'N', 2 * n, nkb, 1.D0, vkb_d, &
CALL cudaDGEMV( 'N', 2 * n, nkb, 1.D0, vkb_d, &
2 * lda, ps_d, 1, 1.D0, spsi_d, 1 )
ELSE
CALL cublasDGEMM( 'N', 'N', 2 * n, m, nkb, 1.D0, vkb_d, &
@ -328,6 +330,7 @@ SUBROUTINE s_psi__gpu( lda, n, m, psi_d, spsi_d )
! sync vkb if needed
CALL using_vkb_d(0)
CALL using_becp_k(0)
!
ps(:,:) = ( 0.D0, 0.D0 )
!
@ -394,6 +397,7 @@ SUBROUTINE s_psi__gpu( lda, n, m, psi_d, spsi_d )
! sync vkb if needed
CALL using_vkb_d(0)
CALL using_becp_nc(0)
ps(:,:,:) = (0.D0,0.D0)
!
@ -448,18 +452,7 @@ SUBROUTINE s_psi__gpu( lda, n, m, psi_d, spsi_d )
END SUBROUTINE s_psi__gpu
!@nje
SUBROUTINE myDGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
use cudafor
use cublas
implicit none
DOUBLE PRECISION ALPHA,BETA
INTEGER INCX,INCY,LDA,M,N
CHARACTER TRANS
DOUBLE PRECISION, DEVICE :: A(LDA,*),X(*),Y(*)
!
call DGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
!
END SUBROUTINE myDGEMV
SUBROUTINE s_psi_gpu_compatibility( lda, n, m, psi_d, spsi_d )
USE kinds, ONLY : DP

View File

@ -140,6 +140,7 @@ SUBROUTINE dndepsilon ( ipol, jpol, ldim, dns )
me_pool, nproc_pool
USE mp, ONLY : mp_sum
USE wavefunctions_module_gpum, ONLY : using_evc
USE becmod_subs_gpum, ONLY : using_becp_auto
IMPLICIT NONE
!
@ -175,6 +176,8 @@ SUBROUTINE dndepsilon ( ipol, jpol, ldim, dns )
call allocate_bec_type ( nkb,nbnd, becp )
ALLOCATE ( dns_(ldim,ldim,nspin,nat) )
!
CALL using_becp_auto(2)
!
! D_Sl for l=1 and l=2 are already initialized, for l=0 D_S0 is 1
!
! Offset of atomic wavefunctions initialized in setup and stored in offsetU
@ -280,6 +283,8 @@ SUBROUTINE dndepsilon ( ipol, jpol, ldim, dns )
DEALLOCATE ( spsi )
DEALLOCATE ( wfcatom )
CALL using_becp_auto(2)
RETURN
END SUBROUTINE dndepsilon
!
@ -490,6 +495,7 @@ SUBROUTINE dprojdepsilon_gamma ( spsi, ik, ipol, jpol, nb_s, nb_e, mykey, dproj
USE mp, ONLY : mp_sum
USE wavefunctions_module_gpum, ONLY : using_evc
USE uspp_gpum, ONLY : using_vkb, using_indv_ijkb0, using_qq_at
USE becmod_subs_gpum, ONLY : using_becp_auto
IMPLICIT NONE
!

View File

@ -34,6 +34,7 @@ SUBROUTINE stres_us( ik, gk, sigmanlc )
USE wavefunctions_module_gpum, ONLY : using_evc
USE wvfct_gpum, ONLY : using_et
USE uspp_gpum, ONLY : using_vkb, using_deeq
USE becmod_subs_gpum, ONLY : using_becp_auto
!
IMPLICIT NONE
!
@ -56,7 +57,8 @@ SUBROUTINE stres_us( ik, gk, sigmanlc )
IF ( nks > 1 ) CALL init_us_2( npw, igk_k(1,ik), xk(1,ik), vkb )
!
CALL allocate_bec_type ( nkb, nbnd, becp, intra_bgrp_comm )
CALL using_vkb(0)
CALL using_vkb(0); CALL using_becp_auto(2)
CALL calbec( npw, vkb, evc, becp )
!
ALLOCATE( qm1( npwx ) )
@ -81,6 +83,7 @@ SUBROUTINE stres_us( ik, gk, sigmanlc )
!
DEALLOCATE( qm1 )
CALL deallocate_bec_type ( becp )
CALL using_becp_auto(2)
!
RETURN
!

View File

@ -47,6 +47,7 @@ SUBROUTINE sum_band()
USE wavefunctions_module_gpum, ONLY : using_evc
USE wvfct_gpum, ONLY : using_et
USE uspp_gpum, ONLY : using_vkb
USE becmod_subs_gpum, ONLY : using_becp_auto
!
IMPLICIT NONE
!
@ -110,6 +111,7 @@ SUBROUTINE sum_band()
! ... Allocate (and later deallocate) arrays needed in specific cases
!
IF ( okvan ) CALL allocate_bec_type (nkb,nbnd, becp,intra_bgrp_comm)
IF ( okvan ) CALL using_becp_auto(2)
IF (dft_is_meta() .OR. lxdm) ALLOCATE (kplusg(npwx))
!
! ... specialized routines are called to sum at Gamma or for each k point
@ -132,6 +134,7 @@ SUBROUTINE sum_band()
!
IF (dft_is_meta() .OR. lxdm) DEALLOCATE (kplusg)
IF ( okvan ) CALL deallocate_bec_type ( becp )
IF ( okvan ) CALL using_becp_auto(2)
!
! ... sum charge density over pools (distributed k-points) and bands
!
@ -878,6 +881,7 @@ SUBROUTINE sum_bec ( ik, current_spin, ibnd_start, ibnd_end, this_bgrp_nbnd )
USE wavefunctions_module_gpum, ONLY : using_evc
USE wvfct_gpum, ONLY : using_et
USE uspp_gpum, ONLY : using_vkb, using_indv_ijkb0
USE becmod_subs_gpum, ONLY : using_becp_auto
!
IMPLICIT NONE
INTEGER, INTENT(IN) :: ik, current_spin, ibnd_start, ibnd_end, this_bgrp_nbnd
@ -893,6 +897,7 @@ SUBROUTINE sum_bec ( ik, current_spin, ibnd_start, ibnd_end, this_bgrp_nbnd )
CALL using_et(0)
CALL using_vkb(0)
CALL using_indv_ijkb0(0)
CALL using_becp_auto(2)
!
npw = ngk(ik)
IF ( .NOT. real_space ) THEN

View File

@ -640,6 +640,7 @@ SUBROUTINE extrapolate_wfcs( wfc_extr )
!
USE wavefunctions_module_gpum, ONLY : using_evc
USE uspp_gpum, ONLY : using_vkb
USE becmod_subs_gpum, ONLY : using_becp_auto
!
IMPLICIT NONE
!
@ -745,6 +746,7 @@ SUBROUTINE extrapolate_wfcs( wfc_extr )
!
IF ( nkb > 0 ) CALL using_vkb(1)
IF ( nkb > 0 ) CALL init_us_2( npw, igk_k(1,ik), xk(1,ik), vkb )
CALL using_becp_auto(2)
CALL calbec( npw, vkb, evc, becp )
!
CALL s_psi ( npwx, npw, nbnd, evc, aux )
@ -841,6 +843,7 @@ SUBROUTINE extrapolate_wfcs( wfc_extr )
DEALLOCATE( u_m, w_m, ew, aux, evcold, sp_m )
DEALLOCATE( work, rwork )
CALL deallocate_bec_type ( becp )
CALL using_becp_auto(2)
!
CLOSE( UNIT = iunoldwfc, STATUS = 'KEEP' )
IF ( wfc_extr > 2 .OR. wfc_order > 2 ) &

View File

@ -31,10 +31,13 @@ SUBROUTINE usnldiag_gpu (npw, h_diag_d, s_diag_d)
!
INTEGER, INTENT(in) :: npw
! number of plane waves
REAL(dp), DEVICE, INTENT(inout) :: h_diag_d (npwx,npol)
REAL(dp), INTENT(inout) :: h_diag_d (npwx,npol)
! the diagonal part of the hamiltonian
REAL(dp), DEVICE, INTENT(out) :: s_diag_d (npwx,npol)
REAL(dp), INTENT(out) :: s_diag_d (npwx,npol)
! the diagonal part of the S matrix
#if defined(__CUDA)
attributes(DEVICE) :: h_diag_d, s_diag_d
#endif
!
INTEGER :: ig, ipol
!

View File

@ -238,6 +238,7 @@ SUBROUTINE init_wfc ( ik )
!
USE wavefunctions_module_gpum, ONLY : using_evc
USE wvfct_gpum, ONLY : using_et
USE becmod_subs_gpum, ONLY : using_becp_auto
!
IMPLICIT NONE
!
@ -345,6 +346,7 @@ SUBROUTINE init_wfc ( ik )
! ... Allocate space for <beta|psi>
!
CALL allocate_bec_type ( nkb, n_starting_wfc, becp, intra_bgrp_comm )
CALL using_becp_auto (2)
!
! ... the following trick is for electric fields with Berry's phase:
! ... by setting lelfield = .false. one prevents the calculation of
@ -370,6 +372,7 @@ SUBROUTINE init_wfc ( ik )
et(1:nbnd,ik) = etatom(1:nbnd)
!
CALL deallocate_bec_type ( becp )
CALL using_becp_auto (2)
DEALLOCATE( etatom )
DEALLOCATE( wfcatom )
!