From 58da51262245098eb76427e42f5150efc54c4e5a Mon Sep 17 00:00:00 2001 From: fabrizio22 Date: Tue, 19 May 2020 16:32:51 +0200 Subject: [PATCH 1/7] orthoUwfc_gpu - scratch --- PW/src/make.gpu | 3 +- PW/src/orthoatwfc_gpu.f90 | 354 ++++++++++++++++++++++++++++++++++++++ PW/src/wfcinit_gpu.f90 | 2 +- 3 files changed, 357 insertions(+), 2 deletions(-) create mode 100644 PW/src/orthoatwfc_gpu.f90 diff --git a/PW/src/make.gpu b/PW/src/make.gpu index 75e61f39d..c3196d501 100644 --- a/PW/src/make.gpu +++ b/PW/src/make.gpu @@ -39,7 +39,8 @@ PWLIBS += \ stres_us_gpu.o \ compute_deff_gpu.o \ gen_us_dj_gpu.o \ - gen_us_dy_gpu.o + gen_us_dy_gpu.o \ + orthoatwfc_gpu.o MODFLAGS += \ $(MOD_FLAG)../../GScratch diff --git a/PW/src/orthoatwfc_gpu.f90 b/PW/src/orthoatwfc_gpu.f90 new file mode 100644 index 000000000..4885ad545 --- /dev/null +++ b/PW/src/orthoatwfc_gpu.f90 @@ -0,0 +1,354 @@ +! +! Copyright (C) 2001-2020 Quantum ESPRESSO group +! This file is distributed under the terms of the +! GNU General Public License. See the file `License' +! in the root directory of the present distribution, +! or http://www.gnu.org/copyleft/gpl.txt . +! +! +!----------------------------------------------------------------------- +SUBROUTINE orthoUwfc_gpu + !----------------------------------------------------------------------- + ! + ! This routine saves to buffer "iunhub" atomic wavefunctions having an + ! associated Hubbard U term * S, for DFT+U(+V) calculations. Same for + ! "iunhub2" but without S (this is then used to computed Hubbard forces + ! and stresses). Atomic wavefunctions + ! are orthogonalized if desired, depending upon the value of "U_projection" + ! "swfcatom" must NOT be allocated on input. + ! + USE kinds, ONLY : DP + USE buffers, ONLY : get_buffer, save_buffer + USE io_global, ONLY : stdout + USE io_files, ONLY : iunhub, iunhub2, nwordwfcU + USE ions_base, ONLY : nat + USE basis, ONLY : natomwfc, swfcatom + USE klist, ONLY : nks, xk, ngk, igk_k, igk_k_d + USE ldaU, ONLY : U_projection, wfcU, nwfcU, copy_U_wfc + USE wvfct, ONLY : npwx + USE uspp, ONLY : nkb, vkb + USE becmod, ONLY : allocate_bec_type, deallocate_bec_type, & + bec_type, becp, calbec + USE control_flags, ONLY : gamma_only + USE noncollin_module, ONLY : noncolin, npol + USE mp_bands, ONLY : use_bgrp_in_hpsi + ! + USE uspp_gpum, ONLY : using_vkb, using_vkb_d, vkb_d + USE becmod_gpum, ONLY : becp_d + USE becmod_subs_gpum, ONLY : using_becp_auto, using_becp_d_auto, & + calbec_gpu + ! + IMPLICIT NONE + ! + ! + INTEGER :: ik, ibnd, info, i, j, k, na, nb, nt, isym, n, ntemp, m, & + l, lm, ltot, ntot, ipol, npw + ! ik: the k point under consideration + ! ibnd: counter on bands + LOGICAL :: orthogonalize_wfc, normalize_only, save_flag + COMPLEX(DP) , ALLOCATABLE :: wfcatom (:,:) + + !----gpu + COMPLEX(DP) , ALLOCATABLE, DEVICE :: wfcatom_d(:,:) + COMPLEX(DP) , ALLOCATABLE, DEVICE :: swfcatom_d(:,:) + !--- + + IF ( U_projection == "pseudo" ) THEN + WRITE( stdout,*) 'Beta functions used for LDA+U Projector' + RETURN + ELSE IF (U_projection=="file") THEN + ! + ! Read atomic wavefunctions from file (produced by pmw.x). In this case, + ! U-specific atomic wavefunctions wfcU coincide with atomic wavefunctions + ! + WRITE( stdout,*) 'LDA+U Projector read from file ' + DO ik = 1, nks + CALL get_buffer(wfcU, nwordwfcU, iunhub, ik) + END DO + RETURN + ELSE IF (U_projection=="atomic") THEN + orthogonalize_wfc = .FALSE. + normalize_only = .FALSE. + WRITE( stdout,*) 'Atomic wfc used for LDA+U Projector are NOT orthogonalized' + ELSE IF (U_projection=="ortho-atomic") THEN + orthogonalize_wfc = .TRUE. + normalize_only = .FALSE. + WRITE( stdout,*) 'Atomic wfc used for LDA+U Projector are orthogonalized' + IF (gamma_only) CALL errore('orthoatwfc', & + 'Gamma-only calculation for this case not implemented', 1 ) + ELSE IF (U_projection=="norm-atomic") THEN + orthogonalize_wfc = .TRUE. + normalize_only = .TRUE. + WRITE( stdout,*) 'Atomic wfc used for LDA+U Projector are normalized but NOT orthogonalized' + IF (gamma_only) CALL errore('orthoatwfc', & + 'Gamma-only calculation for this case not implemented', 1 ) + ELSE + WRITE( stdout,*) "U_projection_type =", U_projection + CALL errore ("orthoatwfc"," this U_projection_type is not valid",1) + END IF + + ALLOCATE( wfcatom(npwx*npol,natomwfc), swfcatom(npwx*npol,natomwfc) ) + + !-----gpu + ALLOCATE( wfcatom_d(npwx*npol,natomwfc), swfcatom_d(npwx*npol,natomwfc) ) !.. + !---- + + save_flag = use_bgrp_in_hpsi ; use_bgrp_in_hpsi=.false. + + ! Allocate the array becp = + CALL allocate_bec_type( nkb, natomwfc, becp ) + CALL using_becp_auto(2) + ! + ! + DO ik = 1, nks + ! + IF (noncolin) THEN + CALL atomic_wfc_nc_updown (ik, wfcatom) + ELSE + CALL atomic_wfc (ik, wfcatom) ! --> da fare in gpu dopo + !--- + wfcatom_d = wfcatom + !---- + ENDIF + npw = ngk(ik) + !CALL using_vkb(1) + CALL using_vkb_d(2) + CALL init_us_2_gpu( npw, igk_k_d(1,ik), xk(1,ik), vkb_d ) + + + CALL using_becp_d_auto(2) + CALL calbec_gpu(npw, vkb_d, wfcatom_d, becp_d) + + + CALL s_psi_gpu( npwx, npw, natomwfc, wfcatom_d, swfcatom_d ) + + !--- + wfcatom = wfcatom_d + swfcatom = swfcatom_d + !--- + + IF (orthogonalize_wfc) & ! --questa è da convertire qui sotto + CALL ortho_swfc ( npw, normalize_only, natomwfc, wfcatom, swfcatom, .TRUE. ) + ! + ! copy atomic wavefunctions with Hubbard U term only in wfcU + ! (this is then used to compute Hubbard forces and stresses) + ! save to unit iunhub2 + ! + CALL copy_U_wfc (wfcatom, noncolin) + CALL save_buffer (wfcU, nwordwfcU, iunhub2, ik) + ! + ! copy S * atomic wavefunctions with Hubbard U term only in wfcU + ! (this is used during the self-consistent solution of Kohn-Sham equations) + ! save to unit iunhub + ! + CALL copy_U_wfc (swfcatom, noncolin) + IF ( nks > 1 ) & + CALL save_buffer (wfcU, nwordwfcU, iunhub, ik) + ! + ENDDO + DEALLOCATE (wfcatom, swfcatom) + CALL deallocate_bec_type ( becp ) + CALL using_becp_auto(2) + use_bgrp_in_hpsi = save_flag + ! + RETURN + +END SUBROUTINE orthoUwfc_gpu +! +!----------------------------------------------------------------------- +SUBROUTINE orthoatwfc_gpu( orthogonalize_wfc ) + !----------------------------------------------------------------------- + ! + ! This routine calculates atomic wavefunctions, orthogonalizes them + ! if "orthogonalzie_wfc" is .true., saves them into buffer "iunsat". + ! "swfcatom" must be allocated on input. + ! Useful for options "wannier" and "one_atom_occupations" + ! + USE kinds, ONLY : DP + USE buffers, ONLY : save_buffer + USE io_global, ONLY : stdout + USE io_files, ONLY : iunsat, nwordatwfc + USE ions_base, ONLY : nat + USE basis, ONLY : natomwfc, swfcatom + USE klist, ONLY : nks, xk, ngk, igk_k + USE wvfct, ONLY : npwx + USE uspp, ONLY : nkb, vkb + USE becmod, ONLY : allocate_bec_type, deallocate_bec_type, & + bec_type, becp, calbec + USE control_flags, ONLY : gamma_only + USE noncollin_module, ONLY : noncolin, npol + ! + USE uspp_gpum, ONLY : using_vkb + ! + IMPLICIT NONE + ! + LOGICAL, INTENT(in) :: orthogonalize_wfc + ! + INTEGER :: ik, ibnd, info, i, j, k, na, nb, nt, isym, n, ntemp, m, & + l, lm, ltot, ntot, ipol, npw + ! ik: the k point under consideration + ! ibnd: counter on bands + LOGICAL :: normalize_only = .FALSE. + COMPLEX(DP) , ALLOCATABLE :: wfcatom (:,:) + + normalize_only=.FALSE. + ALLOCATE (wfcatom( npwx*npol, natomwfc)) + + ! Allocate the array becp = + CALL allocate_bec_type (nkb,natomwfc, becp) + + DO ik = 1, nks + + IF (noncolin) THEN + CALL atomic_wfc_nc_updown (ik, wfcatom) + ELSE + CALL atomic_wfc (ik, wfcatom) + ENDIF + npw = ngk (ik) + CALL using_vkb(1) + CALL init_us_2 (npw, igk_k(1,ik), xk (1, ik), vkb) + CALL calbec (npw, vkb, wfcatom, becp) + CALL s_psi (npwx, npw, natomwfc, wfcatom, swfcatom) + + IF (orthogonalize_wfc) & + CALL ortho_swfc ( npw, normalize_only, natomwfc, wfcatom, swfcatom, .FALSE. ) + ! + ! write S * atomic wfc to unit iunsat + ! + CALL save_buffer (swfcatom, nwordatwfc, iunsat, ik) + ! + ENDDO + DEALLOCATE (wfcatom) + CALL deallocate_bec_type ( becp ) + ! + RETURN + +END SUBROUTINE orthoatwfc_gpu +! +!----------------------------------------------------------------------- +SUBROUTINE ortho_swfc_gpu( npw, normalize_only, m, wfc, swfc, lflag ) + !----------------------------------------------------------------------- + ! + ! On input : wfc (npwx*npol,m) = \psi = a set of "m" (atomic) wavefcts + ! swfc(npwx*npol,m) = S\psi + ! normalize_only = only normalize, do not orthonormalize + ! + ! This routine will compute the overlap matrix O: + ! O_ij = = + ! + ! On output: swfc = O^{-1/2} S\psi, i.e. S * orthonormalized wavefunctions + ! If lflag=.FALSE. : wfc are unchanged on output (not orthonormalized), i.e. + ! wfc = \psi + ! If lflag=.TRUE. : wfc are orthonormalized on output, i.e. + ! wfc = O^{-1/2} \psi, = \delta_{ij} + ! + USE kinds, ONLY : DP + USE wvfct, ONLY : npwx + USE mp_bands, ONLY : intra_bgrp_comm + USE mp, ONLY : mp_sum + USE noncollin_module, ONLY : noncolin, npol + IMPLICIT NONE + ! + INTEGER, INTENT(IN) :: m, npw + LOGICAL, INTENT(IN) :: normalize_only + COMPLEX(dp), INTENT(INOUT) :: wfc (npwx*npol,m) + COMPLEX(dp), INTENT(INOUT) :: swfc(npwx*npol,m) + LOGICAL, INTENT(IN) :: lflag + + COMPLEX(DP) :: temp + COMPLEX(DP) , ALLOCATABLE :: work (:,:), overlap (:,:) + REAL(DP) , ALLOCATABLE :: e (:) + INTEGER :: i, j, k, ipol + + ALLOCATE (overlap( m , m)) + ALLOCATE (work ( m , m)) + ALLOCATE (e ( m)) + + overlap(:,:) = (0.d0,0.d0) + work(:,:) = (0.d0,0.d0) + ! + ! calculate overlap matrix + ! + IF (noncolin) THEN + CALL zgemm ('c', 'n', m, m, npwx*npol, (1.d0, 0.d0), wfc, & + npwx*npol, swfc, npwx*npol, (0.d0,0.d0), overlap, m) + ELSE + CALL zgemm ('c', 'n', m, m, npw, (1.d0, 0.d0), wfc, & + npwx, swfc, npwx, (0.d0, 0.d0), overlap, m) + END IF + ! + CALL mp_sum( overlap, intra_bgrp_comm ) + ! + IF ( normalize_only ) THEN + DO i = 1, m + DO j = i+1, m + overlap(i,j) = CMPLX(0.d0,0.d0, kind=dp) + overlap(j,i) = CMPLX(0.d0,0.d0, kind=dp) + ENDDO + ENDDO + END IF + ! + ! find O^(-1/2) + ! + CALL cdiagh (m, overlap, m, e, work) + DO i = 1, m + e (i) = 1.d0 / SQRT (e (i) ) + ENDDO + DO i = 1, m + DO j = i, m + temp = (0.d0, 0.d0) + DO k = 1, m + temp = temp + e (k) * work (j, k) * CONJG (work (i, k) ) + ENDDO + overlap (i, j) = temp + IF (j.NE.i) overlap (j, i) = CONJG (temp) + ENDDO + ENDDO + ! + ! transform atomic orbitals O^(-1/2) S\psi + ! FIXME: can be done in a faster way by using wfc as work space + ! + DO i = 1, npw + work(:,1) = (0.d0,0.d0) + IF (noncolin) THEN + DO ipol=1,npol + j = i + (ipol-1)*npwx + CALL zgemv ('n',m,m,(1.d0,0.d0),overlap, & + m, swfc(j,1), npwx*npol, (0.d0,0.d0),work,1) + CALL zcopy (m,work,1,swfc(j,1),npwx*npol) + END DO + ELSE + CALL zgemv ('n', m, m, (1.d0, 0.d0) , overlap, & + m, swfc (i, 1) , npwx, (0.d0, 0.d0) , work, 1) + CALL zcopy (m, work, 1, swfc (i, 1), npwx) + END IF + ENDDO + ! + ! If lflag=.TRUE. transform atomic orbitals without + ! the ultrasoft S operator O^(-1/2) \psi + ! + IF (lflag) THEN + DO i = 1, npw + work(:,1) = (0.d0,0.d0) + IF (noncolin) THEN + DO ipol=1,npol + j = i + (ipol-1)*npwx + CALL zgemv ('n',m,m,(1.d0,0.d0),overlap, & + m, wfc(j,1), npwx*npol, (0.d0,0.d0),work,1) + CALL zcopy (m,work,1,wfc(j,1),npwx*npol) + END DO + ELSE + CALL zgemv ('n', m, m, (1.d0, 0.d0) , overlap, & + m, wfc (i, 1) , npwx, (0.d0, 0.d0) , work, 1) + CALL zcopy (m, work, 1, wfc (i, 1), npwx) + END IF + ENDDO + ENDIF + ! + DEALLOCATE (overlap) + DEALLOCATE (work) + DEALLOCATE (e) + ! + RETURN + ! +END SUBROUTINE ortho_swfc_gpu diff --git a/PW/src/wfcinit_gpu.f90 b/PW/src/wfcinit_gpu.f90 index 3cb3ba7bb..a12780c8e 100644 --- a/PW/src/wfcinit_gpu.f90 +++ b/PW/src/wfcinit_gpu.f90 @@ -52,7 +52,7 @@ SUBROUTINE wfcinit_gpu() ! ... Orthogonalized atomic functions needed for DFT+U and other cases ! IF ( use_wannier .OR. one_atom_occupations ) CALL orthoatwfc ( use_wannier ) - IF ( lda_plus_u ) CALL orthoUwfc() + IF ( lda_plus_u ) CALL orthoUwfc_gpu() ! ! ... open files/buffer for wavefunctions (nwordwfc set in openfil) ! ... io_level > 1 : open file, otherwise: open buffer From 473d89b5ca73cdafb5d1747e83d7b6073c42f098 Mon Sep 17 00:00:00 2001 From: fabrizio22 Date: Wed, 20 May 2020 12:56:44 +0200 Subject: [PATCH 2/7] ortho_swfc_gpu - scratch --- PW/src/orthoatwfc_gpu.f90 | 117 ++++++++++++++++++++++++++------------ 1 file changed, 81 insertions(+), 36 deletions(-) diff --git a/PW/src/orthoatwfc_gpu.f90 b/PW/src/orthoatwfc_gpu.f90 index 4885ad545..ae8b0f95c 100644 --- a/PW/src/orthoatwfc_gpu.f90 +++ b/PW/src/orthoatwfc_gpu.f90 @@ -6,6 +6,10 @@ ! or http://www.gnu.org/copyleft/gpl.txt . ! ! +#if !defined(__CUDA) +#define cublasZgemm zgemm +#endif +! !----------------------------------------------------------------------- SUBROUTINE orthoUwfc_gpu !----------------------------------------------------------------------- @@ -92,7 +96,6 @@ SUBROUTINE orthoUwfc_gpu !-----gpu ALLOCATE( wfcatom_d(npwx*npol,natomwfc), swfcatom_d(npwx*npol,natomwfc) ) !.. !---- - save_flag = use_bgrp_in_hpsi ; use_bgrp_in_hpsi=.false. ! Allocate the array becp = @@ -106,19 +109,18 @@ SUBROUTINE orthoUwfc_gpu CALL atomic_wfc_nc_updown (ik, wfcatom) ELSE CALL atomic_wfc (ik, wfcatom) ! --> da fare in gpu dopo - !--- - wfcatom_d = wfcatom - !---- ENDIF + !--- + wfcatom_d = wfcatom + !---- + npw = ngk(ik) !CALL using_vkb(1) CALL using_vkb_d(2) CALL init_us_2_gpu( npw, igk_k_d(1,ik), xk(1,ik), vkb_d ) - CALL using_becp_d_auto(2) - CALL calbec_gpu(npw, vkb_d, wfcatom_d, becp_d) - + CALL calbec_gpu( npw, vkb_d, wfcatom_d, becp_d ) CALL s_psi_gpu( npwx, npw, natomwfc, wfcatom_d, swfcatom_d ) @@ -127,15 +129,15 @@ SUBROUTINE orthoUwfc_gpu swfcatom = swfcatom_d !--- - IF (orthogonalize_wfc) & ! --questa è da convertire qui sotto - CALL ortho_swfc ( npw, normalize_only, natomwfc, wfcatom, swfcatom, .TRUE. ) + !IF (orthogonalize_wfc) & ! --questa è da convertire qui sotto + CALL ortho_swfc_gpu( npw, normalize_only, natomwfc, wfcatom, swfcatom, .TRUE. ) ! ! copy atomic wavefunctions with Hubbard U term only in wfcU ! (this is then used to compute Hubbard forces and stresses) ! save to unit iunhub2 ! - CALL copy_U_wfc (wfcatom, noncolin) - CALL save_buffer (wfcU, nwordwfcU, iunhub2, ik) + CALL copy_U_wfc( wfcatom, noncolin ) + CALL save_buffer( wfcU, nwordwfcU, iunhub2, ik ) ! ! copy S * atomic wavefunctions with Hubbard U term only in wfcU ! (this is used during the self-consistent solution of Kohn-Sham equations) @@ -242,6 +244,11 @@ SUBROUTINE ortho_swfc_gpu( npw, normalize_only, m, wfc, swfc, lflag ) ! If lflag=.TRUE. : wfc are orthonormalized on output, i.e. ! wfc = O^{-1/2} \psi, = \delta_{ij} ! +#if defined(__CUDA) + USE cudafor + USE cublas +#endif + ! USE kinds, ONLY : DP USE wvfct, ONLY : npwx USE mp_bands, ONLY : intra_bgrp_comm @@ -256,55 +263,95 @@ SUBROUTINE ortho_swfc_gpu( npw, normalize_only, m, wfc, swfc, lflag ) LOGICAL, INTENT(IN) :: lflag COMPLEX(DP) :: temp - COMPLEX(DP) , ALLOCATABLE :: work (:,:), overlap (:,:) - REAL(DP) , ALLOCATABLE :: e (:) + COMPLEX(DP), ALLOCATABLE :: work(:,:) + + REAL(DP) , ALLOCATABLE :: e(:) + COMPLEX(DP), ALLOCATABLE :: overlap(:,:) + + COMPLEX(DP), ALLOCATABLE, DEVICE :: overlap_d(:,:) + COMPLEX(DP), ALLOCATABLE, DEVICE :: wfc_d(:,:) + COMPLEX(DP), ALLOCATABLE, DEVICE :: swfc_d(:,:) + COMPLEX(DP), ALLOCATABLE, DEVICE :: work_d(:,:) + REAL(DP) , ALLOCATABLE, DEVICE :: e_d(:) INTEGER :: i, j, k, ipol - ALLOCATE (overlap( m , m)) - ALLOCATE (work ( m , m)) - ALLOCATE (e ( m)) - overlap(:,:) = (0.d0,0.d0) + ALLOCATE( overlap(m,m) ) + ALLOCATE( work(m,m) ) + ALLOCATE( e(m) ) + + + !---gpu + ALLOCATE( overlap_d(m,m) ) + ALLOCATE( wfc_d(npwx*npol,m) ) + ALLOCATE( swfc_d(npwx*npol,m) ) + ALLOCATE( work_d(m,m) ) + ALLOCATE( e_d(m) ) + wfc_d = wfc + swfc_d = swfc + !--- + + print *, 'ososososososo2' + + ! + overlap_d(:,:) = (0.d0,0.d0) work(:,:) = (0.d0,0.d0) ! ! calculate overlap matrix ! IF (noncolin) THEN - CALL zgemm ('c', 'n', m, m, npwx*npol, (1.d0, 0.d0), wfc, & - npwx*npol, swfc, npwx*npol, (0.d0,0.d0), overlap, m) + CALL cublasZgemm( 'C', 'N', m, m, npwx*npol, (1.d0,0.d0), wfc_d, & + npwx*npol, swfc_d, npwx*npol, (0.d0,0.d0), overlap_d, m ) ELSE - CALL zgemm ('c', 'n', m, m, npw, (1.d0, 0.d0), wfc, & - npwx, swfc, npwx, (0.d0, 0.d0), overlap, m) + CALL cublasZgemm( 'C', 'N', m, m, npw, (1.d0,0.d0), wfc_d, & + npwx, swfc_d, npwx, (0.d0,0.d0), overlap_d, m ) END IF ! - CALL mp_sum( overlap, intra_bgrp_comm ) + CALL mp_sum( overlap_d, intra_bgrp_comm ) ! IF ( normalize_only ) THEN + !$cuf kernel do (1) <<<*,*>>> DO i = 1, m DO j = i+1, m - overlap(i,j) = CMPLX(0.d0,0.d0, kind=dp) - overlap(j,i) = CMPLX(0.d0,0.d0, kind=dp) + overlap_d(i,j) = CMPLX(0.d0,0.d0, kind=dp) + overlap_d(j,i) = CMPLX(0.d0,0.d0, kind=dp) ENDDO ENDDO END IF ! ! find O^(-1/2) ! - CALL cdiagh (m, overlap, m, e, work) + + overlap = overlap_d + + CALL cdiagh( m, overlap, m, e, work ) !_gpu + + e_d = e + work_d = work + + !$cuf kernel do (1) <<<*,*>>> DO i = 1, m - e (i) = 1.d0 / SQRT (e (i) ) + e_d(i) = 1.d0 / SQRT(e_d(i)) ENDDO + !$cuf kernel do (1) <<<*,*>>> DO i = 1, m DO j = i, m temp = (0.d0, 0.d0) DO k = 1, m - temp = temp + e (k) * work (j, k) * CONJG (work (i, k) ) + temp = temp + e_d(k) * work_d(j,k) * CONJG(work_d(i,k) ) ENDDO - overlap (i, j) = temp - IF (j.NE.i) overlap (j, i) = CONJG (temp) + overlap_d(i,j) = temp + IF (j /= i) overlap_d(j,i) = CONJG(temp) ENDDO ENDDO ! + + + overlap = overlap_d + e = e_d + + DEALLOCATE( wfc_d, swfc_d, overlap_d, work_d, e_d ) + ! transform atomic orbitals O^(-1/2) S\psi ! FIXME: can be done in a faster way by using wfc as work space ! @@ -313,14 +360,14 @@ SUBROUTINE ortho_swfc_gpu( npw, normalize_only, m, wfc, swfc, lflag ) IF (noncolin) THEN DO ipol=1,npol j = i + (ipol-1)*npwx - CALL zgemv ('n',m,m,(1.d0,0.d0),overlap, & + CALL zgemv('n',m,m,(1.d0,0.d0),overlap, & m, swfc(j,1), npwx*npol, (0.d0,0.d0),work,1) - CALL zcopy (m,work,1,swfc(j,1),npwx*npol) + CALL zcopy(m,work,1,swfc(j,1),npwx*npol) END DO ELSE - CALL zgemv ('n', m, m, (1.d0, 0.d0) , overlap, & + CALL zgemv('n', m, m, (1.d0, 0.d0) , overlap, & m, swfc (i, 1) , npwx, (0.d0, 0.d0) , work, 1) - CALL zcopy (m, work, 1, swfc (i, 1), npwx) + CALL zcopy(m, work, 1, swfc (i, 1), npwx) END IF ENDDO ! @@ -345,9 +392,7 @@ SUBROUTINE ortho_swfc_gpu( npw, normalize_only, m, wfc, swfc, lflag ) ENDDO ENDIF ! - DEALLOCATE (overlap) - DEALLOCATE (work) - DEALLOCATE (e) + DEALLOCATE( wfc_d, swfc_d ) ! RETURN ! From ac7c155c02027cccd3cae5ec87bcbe9c34fa7c9f Mon Sep 17 00:00:00 2001 From: fabrizio22 Date: Thu, 21 May 2020 17:18:36 +0200 Subject: [PATCH 3/7] ortho_swfc_gpu complete --- PW/src/orthoatwfc_gpu.f90 | 129 ++++++++++++++++---------------------- 1 file changed, 54 insertions(+), 75 deletions(-) diff --git a/PW/src/orthoatwfc_gpu.f90 b/PW/src/orthoatwfc_gpu.f90 index ae8b0f95c..496b31912 100644 --- a/PW/src/orthoatwfc_gpu.f90 +++ b/PW/src/orthoatwfc_gpu.f90 @@ -121,16 +121,15 @@ SUBROUTINE orthoUwfc_gpu CALL using_becp_d_auto(2) CALL calbec_gpu( npw, vkb_d, wfcatom_d, becp_d ) - + ! CALL s_psi_gpu( npwx, npw, natomwfc, wfcatom_d, swfcatom_d ) - - !--- + ! + IF (orthogonalize_wfc) CALL ortho_swfc_gpu( npw, normalize_only, & + natomwfc, wfcatom_d, swfcatom_d, .TRUE. ) + ! wfcatom = wfcatom_d swfcatom = swfcatom_d - !--- - - !IF (orthogonalize_wfc) & ! --questa è da convertire qui sotto - CALL ortho_swfc_gpu( npw, normalize_only, natomwfc, wfcatom, swfcatom, .TRUE. ) + ! ! ! copy atomic wavefunctions with Hubbard U term only in wfcU ! (this is then used to compute Hubbard forces and stresses) @@ -228,7 +227,7 @@ SUBROUTINE orthoatwfc_gpu( orthogonalize_wfc ) END SUBROUTINE orthoatwfc_gpu ! !----------------------------------------------------------------------- -SUBROUTINE ortho_swfc_gpu( npw, normalize_only, m, wfc, swfc, lflag ) +SUBROUTINE ortho_swfc_gpu( npw, normalize_only, m, wfc_d, swfc_d, lflag ) !----------------------------------------------------------------------- ! ! On input : wfc (npwx*npol,m) = \psi = a set of "m" (atomic) wavefcts @@ -245,57 +244,42 @@ SUBROUTINE ortho_swfc_gpu( npw, normalize_only, m, wfc, swfc, lflag ) ! wfc = O^{-1/2} \psi, = \delta_{ij} ! #if defined(__CUDA) - USE cudafor USE cublas -#endif +#endif ! USE kinds, ONLY : DP USE wvfct, ONLY : npwx - USE mp_bands, ONLY : intra_bgrp_comm + USE mp_bands, ONLY : intra_bgrp_comm, me_bgrp, root_bgrp USE mp, ONLY : mp_sum USE noncollin_module, ONLY : noncolin, npol + ! IMPLICIT NONE ! INTEGER, INTENT(IN) :: m, npw LOGICAL, INTENT(IN) :: normalize_only - COMPLEX(dp), INTENT(INOUT) :: wfc (npwx*npol,m) - COMPLEX(dp), INTENT(INOUT) :: swfc(npwx*npol,m) + COMPLEX(DP), INTENT(INOUT) :: wfc_d(npwx*npol,m) + COMPLEX(DP), INTENT(INOUT) :: swfc_d(npwx*npol,m) LOGICAL, INTENT(IN) :: lflag - - COMPLEX(DP) :: temp - COMPLEX(DP), ALLOCATABLE :: work(:,:) - - REAL(DP) , ALLOCATABLE :: e(:) - COMPLEX(DP), ALLOCATABLE :: overlap(:,:) - - COMPLEX(DP), ALLOCATABLE, DEVICE :: overlap_d(:,:) - COMPLEX(DP), ALLOCATABLE, DEVICE :: wfc_d(:,:) - COMPLEX(DP), ALLOCATABLE, DEVICE :: swfc_d(:,:) - COMPLEX(DP), ALLOCATABLE, DEVICE :: work_d(:,:) - REAL(DP) , ALLOCATABLE, DEVICE :: e_d(:) + ! + ! ... local variables + ! INTEGER :: i, j, k, ipol - - - ALLOCATE( overlap(m,m) ) - ALLOCATE( work(m,m) ) - ALLOCATE( e(m) ) - - - !---gpu + COMPLEX(DP) :: temp + ! + COMPLEX(DP), ALLOCATABLE :: overlap_d(:,:) + COMPLEX(DP), ALLOCATABLE :: work_d(:,:), s_d(:,:) + REAL(DP) , ALLOCATABLE :: e_d(:) + ! +#if defined(__CUDA) + attributes(DEVICE) :: wfc_d, swfc_d, overlap_d, work_d, s_d, e_d +#endif + ! ALLOCATE( overlap_d(m,m) ) - ALLOCATE( wfc_d(npwx*npol,m) ) - ALLOCATE( swfc_d(npwx*npol,m) ) - ALLOCATE( work_d(m,m) ) + ALLOCATE( work_d(m,m), s_d(m,m) ) ALLOCATE( e_d(m) ) - wfc_d = wfc - swfc_d = swfc - !--- - - print *, 'ososososososo2' - ! overlap_d(:,:) = (0.d0,0.d0) - work(:,:) = (0.d0,0.d0) + work_d(:,:) = (0.d0,0.d0) ! ! calculate overlap matrix ! @@ -321,14 +305,15 @@ SUBROUTINE ortho_swfc_gpu( npw, normalize_only, m, wfc, swfc, lflag ) ! ! find O^(-1/2) ! - - overlap = overlap_d - - CALL cdiagh( m, overlap, m, e, work ) !_gpu - - e_d = e - work_d = work - + s_d = CMPLX(0.d0,0.d0, kind=dp) + !$cuf kernel do (1) + DO i = 1, m + s_d(i,i) = CMPLX(1.d0,0.d0, kind=dp) + ENDDO + ! + CALL laxlib_cdiaghg_gpu( m, m, overlap_d, s_d, m, e_d, work_d, me_bgrp, & + root_bgrp, intra_bgrp_comm ) + ! !$cuf kernel do (1) <<<*,*>>> DO i = 1, m e_d(i) = 1.d0 / SQRT(e_d(i)) @@ -338,36 +323,29 @@ SUBROUTINE ortho_swfc_gpu( npw, normalize_only, m, wfc, swfc, lflag ) DO j = i, m temp = (0.d0, 0.d0) DO k = 1, m - temp = temp + e_d(k) * work_d(j,k) * CONJG(work_d(i,k) ) + temp = temp + e_d(k) * work_d(j,k) * CONJG(work_d(i,k)) ENDDO overlap_d(i,j) = temp IF (j /= i) overlap_d(j,i) = CONJG(temp) ENDDO ENDDO ! - - - overlap = overlap_d - e = e_d - - DEALLOCATE( wfc_d, swfc_d, overlap_d, work_d, e_d ) - ! transform atomic orbitals O^(-1/2) S\psi ! FIXME: can be done in a faster way by using wfc as work space ! DO i = 1, npw - work(:,1) = (0.d0,0.d0) + work_d(:,1) = (0.d0,0.d0) IF (noncolin) THEN DO ipol=1,npol j = i + (ipol-1)*npwx - CALL zgemv('n',m,m,(1.d0,0.d0),overlap, & - m, swfc(j,1), npwx*npol, (0.d0,0.d0),work,1) - CALL zcopy(m,work,1,swfc(j,1),npwx*npol) + CALL cublasZgemv( 'N', m, m, (1.d0,0.d0), overlap_d, & + m, swfc_d(j,1), npwx*npol, (0.d0,0.d0), work_d, 1 ) + CALL cublasZcopy( m, work_d, 1, swfc_d(j,1), npwx*npol ) END DO ELSE - CALL zgemv('n', m, m, (1.d0, 0.d0) , overlap, & - m, swfc (i, 1) , npwx, (0.d0, 0.d0) , work, 1) - CALL zcopy(m, work, 1, swfc (i, 1), npwx) + CALL cublasZgemv( 'N', m, m, (1.d0,0.d0), overlap_d, & + m, swfc_d(i,1), npwx, (0.d0,0.d0), work_d, 1 ) + CALL cublasZcopy( m, work_d, 1, swfc_d(i,1), npwx ) END IF ENDDO ! @@ -376,23 +354,24 @@ SUBROUTINE ortho_swfc_gpu( npw, normalize_only, m, wfc, swfc, lflag ) ! IF (lflag) THEN DO i = 1, npw - work(:,1) = (0.d0,0.d0) + work_d(:,1) = (0.d0,0.d0) IF (noncolin) THEN - DO ipol=1,npol + DO ipol = 1, npol j = i + (ipol-1)*npwx - CALL zgemv ('n',m,m,(1.d0,0.d0),overlap, & - m, wfc(j,1), npwx*npol, (0.d0,0.d0),work,1) - CALL zcopy (m,work,1,wfc(j,1),npwx*npol) + CALL cublasZgemv( 'N', m, m, (1.d0,0.d0), overlap_d, & + m, wfc_d(j,1), npwx*npol, (0.d0,0.d0), work_d, 1 ) + CALL cublasZcopy( m, work_d, 1, wfc_d(j,1), npwx*npol ) END DO ELSE - CALL zgemv ('n', m, m, (1.d0, 0.d0) , overlap, & - m, wfc (i, 1) , npwx, (0.d0, 0.d0) , work, 1) - CALL zcopy (m, work, 1, wfc (i, 1), npwx) + CALL cublasZgemv( 'N', m, m, (1.d0,0.d0), overlap_d, & + m, wfc_d(i,1), npwx, (0.d0,0.d0), work_d, 1 ) + CALL cublasZcopy( m, work_d, 1, wfc_d(i,1), npwx ) END IF ENDDO ENDIF ! - DEALLOCATE( wfc_d, swfc_d ) + DEALLOCATE( overlap_d, s_d ) + DEALLOCATE( work_d, e_d ) ! RETURN ! From 07f33654496e53c0030a3596122e2ec4dee4856d Mon Sep 17 00:00:00 2001 From: fabrizio22 Date: Thu, 21 May 2020 20:01:24 +0200 Subject: [PATCH 4/7] atomic_wfc_gpu --- PW/src/atomic_wfc_gpu.f90 | 411 ++++++++++++++++++++++++++++++++++++++ PW/src/make.gpu | 3 +- PW/src/orthoatwfc_gpu.f90 | 53 +++-- 3 files changed, 439 insertions(+), 28 deletions(-) create mode 100644 PW/src/atomic_wfc_gpu.f90 diff --git a/PW/src/atomic_wfc_gpu.f90 b/PW/src/atomic_wfc_gpu.f90 new file mode 100644 index 000000000..bda7b7875 --- /dev/null +++ b/PW/src/atomic_wfc_gpu.f90 @@ -0,0 +1,411 @@ +! +! Copyright (C) 2001-2007 Quantum ESPRESSO group +! This file is distributed under the terms of the +! GNU General Public License. See the file `License' +! in the root directory of the present distribution, +! or http://www.gnu.org/copyleft/gpl.txt . +! +! +!----------------------------------------------------------------------- +SUBROUTINE atomic_wfc_gpu( ik, wfcatom_d ) + !----------------------------------------------------------------------- + !! This routine computes the superposition of atomic wavefunctions + !! for k-point "ik" - output in "wfcatom". + ! + USE kinds, ONLY : DP + USE constants, ONLY : tpi, fpi, pi + USE cell_base, ONLY : omega, tpiba + USE ions_base, ONLY : nat, ntyp => nsp, ityp, tau + USE basis, ONLY : natomwfc + !USE gvect, ONLY : mill, eigts1, eigts2, eigts3, g + USE klist, ONLY : xk, ngk, igk_k_d !, igk_k + USE wvfct, ONLY : npwx + USE us, ONLY : tab_at, dq + USE uspp_param, ONLY : upf + USE noncollin_module, ONLY : noncolin, npol, angle1, angle2 + USE spin_orb, ONLY : lspinorb, rot_ylm, fcoef, lmaxx, domag, & + starting_spin_angle + USE mp_bands, ONLY : inter_bgrp_comm + USE mp, ONLY : mp_sum + ! + USE gvect_gpum, ONLY : mill_d, eigts1_d, eigts2_d, eigts3_d, g_d + USE us_gpum, ONLY : using_tab_at, using_tab_at_d, tab_at_d + ! + IMPLICIT NONE + ! + INTEGER, INTENT(IN) :: ik + !! k-point index + COMPLEX(DP), INTENT(OUT) :: wfcatom_d(npwx,npol,natomwfc) + !! Superposition of atomic wavefunctions + ! + ! ... local variables + ! + INTEGER :: n_starting_wfc, lmax_wfc, nt, l, nb, na, m, lm, ig, iig, & + i0, i1, i2, i3, nwfcm, npw + COMPLEX(DP), ALLOCATABLE :: aux(:) + COMPLEX(DP) :: kphase, lphase + REAL(DP) :: arg, px, ux, vx, wx + INTEGER :: ig_start, ig_end, mil1, mil2, mil3 + ! + REAL(DP) :: xk1, xk2, xk3, qgr + REAL(DP), ALLOCATABLE :: chiq_d(:,:,:) + REAL(DP), ALLOCATABLE :: ylm_d(:,:), gk_d(:,:), qg_d(:) + COMPLEX(DP), ALLOCATABLE :: sk_d(:) + ! +#if defined(__CUDA) + attributes(DEVICE) :: wfcatom_d, ylm_d, gk_d, qg_d, sk_d, chiq_d +#endif + ! + CALL start_clock( 'atomic_wfc' ) + + ! calculate max angular momentum required in wavefunctions + lmax_wfc = 0 + DO nt = 1, ntyp + lmax_wfc = MAX( lmax_wfc, MAXVAL( upf(nt)%lchi(1:upf(nt)%nwfc) ) ) + END DO + ! + nwfcm = MAXVAL( upf(1:ntyp)%nwfc ) + npw = ngk(ik) + ! + ALLOCATE( ylm_d(npw,(lmax_wfc+1)**2), gk_d(3,npw), qg_d(npw) ) + ALLOCATE( chiq_d(npw,nwfcm,ntyp), sk_d(npw) ) + ! + xk1 = xk(1,ik) + xk2 = xk(2,ik) + xk3 = xk(3,ik) + ! + !$cuf kernel do (1) <<<*,*>>> + DO ig = 1, npw + iig = igk_k_d(ig,ik) + gk_d(1,ig) = xk1 + g_d(1,iig) + gk_d(2,ig) = xk2 + g_d(2,iig) + gk_d(3,ig) = xk3 + g_d(3,iig) + qg_d(ig) = gk_d(1,ig)**2 + gk_d(2,ig)**2 + gk_d(3,ig)**2 + END DO + ! + ! ylm = spherical harmonics + ! + CALL ylmr2_gpu( (lmax_wfc+1)**2, npw, gk_d, qg_d, ylm_d ) + ! + ! from now to the end of the routine the ig loops are distributed across bgrp + CALL divide( inter_bgrp_comm, npw, ig_start, ig_end ) + ! + ! set now q=|k+G| in atomic units + ! + !$cuf kernel do (1) <<<*,*>>> + DO ig = ig_start, ig_end + qg_d(ig) = SQRT( qg_d(ig) )*tpiba + END DO + ! + n_starting_wfc = 0 + ! + ! chiq = radial fourier transform of atomic orbitals chi + ! + CALL using_tab_at(0) + CALL using_tab_at_d(0) + ! + DO nt = 1, ntyp + DO nb = 1, upf(nt)%nwfc + IF ( upf(nt)%oc(nb) >= 0.d0 ) THEN + ! + !$cuf kernel do (1) <<<*,*>>> + DO ig = ig_start, ig_end + qgr = qg_d(ig) + px = qgr / dq - DBLE(INT(qgr/dq)) + ux = 1.d0 - px + vx = 2.d0 - px + wx = 3.d0 - px + i0 = INT(qgr/dq) + 1 + i1 = i0 + 1 + i2 = i0 + 2 + i3 = i0 + 3 + chiq_d(ig,nb,nt) = & + tab_at_d(i0,nb,nt) * ux * vx * wx / 6.d0 + & + tab_at_d(i1,nb,nt) * px * vx * wx / 2.d0 - & + tab_at_d(i2,nb,nt) * px * ux * wx / 2.d0 + & + tab_at_d(i3,nb,nt) * px * ux * vx / 6.d0 + END DO + ! + END IF + END DO + END DO + + DEALLOCATE( qg_d, gk_d ) + ! + ALLOCATE( aux(npw) ) + ! + wfcatom_d(:,:,:) = (0.0_dp, 0.0_dp) + ! + DO na = 1, nat + arg = (xk1*tau(1,na) + xk2*tau(2,na) + xk3*tau(3,na)) * tpi + kphase = CMPLX( COS(arg), - SIN(arg) ,KIND=DP) + ! + ! sk is the structure factor + ! + !$cuf kernel do (1) <<<*,*>>> + DO ig = ig_start, ig_end + iig = igk_k_d(ig,ik) + mil1 = mill_d(1,iig) + mil2 = mill_d(2,iig) + mil3 = mill_d(3,iig) + sk_d(ig) = kphase * eigts1_d(mil1,na) * & + eigts2_d(mil2,na) * & + eigts3_d(mil3,na) + END DO + ! + nt = ityp(na) + DO nb = 1, upf(nt)%nwfc + IF ( upf(nt)%oc(nb) >= 0.d0 ) THEN + l = upf(nt)%lchi(nb) + lphase = (0.d0,1.d0)**l + ! + ! the factor i^l MUST BE PRESENT in order to produce + ! wavefunctions for k=0 that are real in real space + ! + IF ( noncolin ) THEN +! ! +! IF ( upf(nt)%has_so ) THEN +! ! +! IF (starting_spin_angle.OR..NOT.domag) THEN +! CALL atomic_wfc_so( ) +! ELSE +! CALL atomic_wfc_so_mag( ) +! END IF +! ! +! ELSE +! ! +! CALL atomic_wfc_nc( ) +! ! +! END IF +! ! + ELSE + ! + CALL atomic_wfc___gpu( ) + ! + END IF + ! + END IF + ! + END DO + ! + END DO + + IF ( n_starting_wfc /= natomwfc) call errore ('atomic_wfc', & + 'internal error: some wfcs were lost ', 1 ) + + DEALLOCATE( aux ) + DEALLOCATE( sk_d, chiq_d, ylm_d ) + + ! collect results across bgrp + CALL mp_sum( wfcatom_d, inter_bgrp_comm ) + + CALL stop_clock( 'atomic_wfc' ) + RETURN + +CONTAINS +!---------------------------------------------------------------- +! SUBROUTINE atomic_wfc_so( ) +! !------------------------------------------------------------ +! !! Spin-orbit case. +! ! +! REAL(DP) :: fact(2), j +! REAL(DP), EXTERNAL :: spinor +! INTEGER :: ind, ind1, n1, is, sph_ind +! ! +! j = upf(nt)%jchi(nb) +! DO m = -l-1, l +! fact(1) = spinor(l,j,m,1) +! fact(2) = spinor(l,j,m,2) +! IF ( ABS(fact(1)) > 1.d-8 .OR. ABS(fact(2)) > 1.d-8 ) THEN +! n_starting_wfc = n_starting_wfc + 1 +! IF (n_starting_wfc > natomwfc) CALL errore & +! ('atomic_wfc_so', 'internal error: too many wfcs', 1) +! DO is=1,2 +! IF (abs(fact(is)) > 1.d-8) THEN +! ind=lmaxx+1+sph_ind(l,j,m,is) +! aux=(0.d0,0.d0) +! DO n1=1,2*l+1 +! ind1=l**2+n1 +! if (abs(rot_ylm(ind,n1)) > 1.d-8) & +! aux(:)=aux(:)+rot_ylm(ind,n1)*ylm(:,ind1) +! ENDDO +! do ig = ig_start, ig_end +! wfcatom(ig,is,n_starting_wfc) = lphase*fact(is)*& +! sk(ig)*aux(ig)*chiq (ig, nb, nt) +! END DO +! ELSE +! wfcatom(:,is,n_starting_wfc) = (0.d0,0.d0) +! END IF +! END DO +! END IF +! END DO +! ! +! END SUBROUTINE atomic_wfc_so +! ! +! SUBROUTINE atomic_wfc_so_mag( ) +! ! +! !! Spin-orbit case, magnetization along "angle1" and "angle2" +! !! In the magnetic case we always assume that magnetism is much larger +! !! than spin-orbit and average the wavefunctions at l+1/2 and l-1/2 +! !! filling then the up and down spinors with the average wavefunctions, +! !! according to the direction of the magnetization, following what is +! !! done in the noncollinear case. +! ! +! REAL(DP) :: alpha, gamman, j +! COMPLEX(DP) :: fup, fdown +! REAL(DP), ALLOCATABLE :: chiaux(:) +! INTEGER :: nc, ib +! ! +! j = upf(nt)%jchi(nb) +! ! +! ! This routine creates two functions only in the case j=l+1/2 or exit in the +! ! other case +! ! +! IF (ABS(j-l+0.5_DP)<1.d-4) RETURN +! +! ALLOCATE(chiaux(npw)) +! ! +! ! Find the functions j=l-1/2 +! ! +! IF (l == 0) THEN +! chiaux(:)=chiq(:,nb,nt) +! ELSE +! DO ib=1, upf(nt)%nwfc +! IF ((upf(nt)%lchi(ib) == l).AND. & +! (ABS(upf(nt)%jchi(ib)-l+0.5_DP)<1.d-4)) THEN +! nc=ib +! EXIT +! ENDIF +! ENDDO +! ! +! ! Average the two functions +! ! +! chiaux(:)=(chiq(:,nb,nt)*(l+1.0_DP)+chiq(:,nc,nt)*l)/(2.0_DP*l+1.0_DP) +! ! +! ENDIF +! ! +! ! and construct the starting wavefunctions as in the noncollinear case. +! ! +! alpha = angle1(nt) +! gamman = - angle2(nt) + 0.5d0*pi +! ! +! DO m = 1, 2 * l + 1 +! lm = l**2 + m +! n_starting_wfc = n_starting_wfc + 1 +! IF ( n_starting_wfc + 2*l+1 > natomwfc ) CALL errore & +! ('atomic_wfc_nc', 'internal error: too many wfcs', 1) +! DO ig = ig_start, ig_end +! aux(ig) = sk(ig)*ylm(ig,lm)*chiaux(ig) +! END DO +! ! +! ! now, rotate wfc as needed +! ! first : rotation with angle alpha around (OX) +! ! +! DO ig = ig_start, ig_end +! fup = cos(0.5d0*alpha)*aux(ig) +! fdown = (0.d0,1.d0)*sin(0.5d0*alpha)*aux(ig) +! ! +! ! Now, build the orthogonal wfc +! ! first rotation with angle (alpha+pi) around (OX) +! ! +! wfcatom(ig,1,n_starting_wfc) = (cos(0.5d0*gamman) & +! +(0.d0,1.d0)*sin(0.5d0*gamman))*fup +! wfcatom(ig,2,n_starting_wfc) = (cos(0.5d0*gamman) & +! -(0.d0,1.d0)*sin(0.5d0*gamman))*fdown +! ! +! ! second: rotation with angle gamma around (OZ) +! ! +! ! Now, build the orthogonal wfc +! ! first rotation with angle (alpha+pi) around (OX) +! ! +! fup = cos(0.5d0*(alpha+pi))*aux(ig) +! fdown = (0.d0,1.d0)*sin(0.5d0*(alpha+pi))*aux(ig) +! ! +! ! second, rotation with angle gamma around (OZ) +! ! +! wfcatom(ig,1,n_starting_wfc+2*l+1) = (cos(0.5d0*gamman) & +! +(0.d0,1.d0)*sin(0.5d0 *gamman))*fup +! wfcatom(ig,2,n_starting_wfc+2*l+1) = (cos(0.5d0*gamman) & +! -(0.d0,1.d0)*sin(0.5d0*gamman))*fdown +! END DO +! END DO +! n_starting_wfc = n_starting_wfc + 2*l+1 +! DEALLOCATE( chiaux ) +! ! +! END SUBROUTINE atomic_wfc_so_mag +! ! +! SUBROUTINE atomic_wfc_nc( ) +! ! +! !! noncolinear case, magnetization along "angle1" and "angle2" +! ! +! REAL(DP) :: alpha, gamman +! COMPLEX(DP) :: fup, fdown +! ! +! alpha = angle1(nt) +! gamman = - angle2(nt) + 0.5d0*pi +! ! +! DO m = 1, 2 * l + 1 +! lm = l**2 + m +! n_starting_wfc = n_starting_wfc + 1 +! IF ( n_starting_wfc + 2*l+1 > natomwfc) CALL errore & +! ('atomic_wfc_nc', 'internal error: too many wfcs', 1) +! DO ig = ig_start, ig_end +! aux(ig) = sk(ig)*ylm(ig,lm)*chiq(ig,nb,nt) +! END DO +! ! +! ! now, rotate wfc as needed +! ! first : rotation with angle alpha around (OX) +! ! +! DO ig = ig_start, ig_end +! fup = cos(0.5d0*alpha)*aux(ig) +! fdown = (0.d0,1.d0)*sin(0.5d0*alpha)*aux(ig) +! ! +! ! Now, build the orthogonal wfc +! ! first rotation with angle (alpha+pi) around (OX) +! ! +! wfcatom(ig,1,n_starting_wfc) = (cos(0.5d0*gamman) & +! +(0.d0,1.d0)*sin(0.5d0*gamman))*fup +! wfcatom(ig,2,n_starting_wfc) = (cos(0.5d0*gamman) & +! -(0.d0,1.d0)*sin(0.5d0*gamman))*fdown +! ! +! ! second: rotation with angle gamma around (OZ) +! ! +! ! Now, build the orthogonal wfc +! ! first rotation with angle (alpha+pi) around (OX) +! ! +! fup = cos(0.5d0*(alpha+pi))*aux(ig) +! fdown = (0.d0,1.d0)*sin(0.5d0*(alpha+pi))*aux(ig) +! ! +! ! second, rotation with angle gamma around (OZ) +! ! +! wfcatom(ig,1,n_starting_wfc+2*l+1) = (cos(0.5d0*gamman) & +! +(0.d0,1.d0)*sin(0.5d0 *gamman))*fup +! wfcatom(ig,2,n_starting_wfc+2*l+1) = (cos(0.5d0*gamman) & +! -(0.d0,1.d0)*sin(0.5d0*gamman))*fdown +! END DO +! END DO +! n_starting_wfc = n_starting_wfc + 2*l+1 +! ! +! END SUBROUTINE atomic_wfc_nc + + SUBROUTINE atomic_wfc___gpu( ) + ! + ! ... LSDA or nonmagnetic case + ! + DO m = 1, 2 * l + 1 + lm = l**2 + m + n_starting_wfc = n_starting_wfc + 1 + IF ( n_starting_wfc > natomwfc) CALL errore & + ('atomic_wfc___', 'internal error: too many wfcs', 1) + ! + !$cuf kernel do (1) <<<*,*>>> + DO ig = ig_start, ig_end + wfcatom_d(ig,1,n_starting_wfc) = lphase * & + sk_d(ig) * CMPLX(ylm_d(ig,lm) * chiq_d(ig,nb,nt)) + ENDDO + ! + END DO + ! + END SUBROUTINE atomic_wfc___gpu + ! +END SUBROUTINE atomic_wfc_gpu diff --git a/PW/src/make.gpu b/PW/src/make.gpu index c3196d501..460aa7be6 100644 --- a/PW/src/make.gpu +++ b/PW/src/make.gpu @@ -40,7 +40,8 @@ PWLIBS += \ compute_deff_gpu.o \ gen_us_dj_gpu.o \ gen_us_dy_gpu.o \ - orthoatwfc_gpu.o + orthoatwfc_gpu.o \ + atomic_wfc_gpu.o MODFLAGS += \ $(MOD_FLAG)../../GScratch diff --git a/PW/src/orthoatwfc_gpu.f90 b/PW/src/orthoatwfc_gpu.f90 index 496b31912..7b83cf459 100644 --- a/PW/src/orthoatwfc_gpu.f90 +++ b/PW/src/orthoatwfc_gpu.f90 @@ -51,12 +51,14 @@ SUBROUTINE orthoUwfc_gpu ! ibnd: counter on bands LOGICAL :: orthogonalize_wfc, normalize_only, save_flag COMPLEX(DP) , ALLOCATABLE :: wfcatom (:,:) - - !----gpu - COMPLEX(DP) , ALLOCATABLE, DEVICE :: wfcatom_d(:,:) - COMPLEX(DP) , ALLOCATABLE, DEVICE :: swfcatom_d(:,:) - !--- - + ! + COMPLEX(DP) , ALLOCATABLE :: wfcatom_d(:,:) + COMPLEX(DP) , ALLOCATABLE :: swfcatom_d(:,:) + ! +#if defined(__CUDA) + attributes(DEVICE) :: wfcatom_d, swfcatom_d +#endif + ! IF ( U_projection == "pseudo" ) THEN WRITE( stdout,*) 'Beta functions used for LDA+U Projector' RETURN @@ -90,14 +92,11 @@ SUBROUTINE orthoUwfc_gpu WRITE( stdout,*) "U_projection_type =", U_projection CALL errore ("orthoatwfc"," this U_projection_type is not valid",1) END IF - + ! ALLOCATE( wfcatom(npwx*npol,natomwfc), swfcatom(npwx*npol,natomwfc) ) - - !-----gpu - ALLOCATE( wfcatom_d(npwx*npol,natomwfc), swfcatom_d(npwx*npol,natomwfc) ) !.. - !---- + ALLOCATE( wfcatom_d(npwx*npol,natomwfc), swfcatom_d(npwx*npol,natomwfc) ) ! save_flag = use_bgrp_in_hpsi ; use_bgrp_in_hpsi=.false. - + ! ! Allocate the array becp = CALL allocate_bec_type( nkb, natomwfc, becp ) CALL using_becp_auto(2) @@ -106,19 +105,16 @@ SUBROUTINE orthoUwfc_gpu DO ik = 1, nks ! IF (noncolin) THEN - CALL atomic_wfc_nc_updown (ik, wfcatom) + CALL atomic_wfc_nc_updown( ik, wfcatom ) ELSE - CALL atomic_wfc (ik, wfcatom) ! --> da fare in gpu dopo + CALL atomic_wfc_gpu( ik, wfcatom_d ) ENDIF - !--- - wfcatom_d = wfcatom - !---- - + ! npw = ngk(ik) !CALL using_vkb(1) CALL using_vkb_d(2) CALL init_us_2_gpu( npw, igk_k_d(1,ik), xk(1,ik), vkb_d ) - + ! CALL using_becp_d_auto(2) CALL calbec_gpu( npw, vkb_d, wfcatom_d, becp_d ) ! @@ -127,14 +123,13 @@ SUBROUTINE orthoUwfc_gpu IF (orthogonalize_wfc) CALL ortho_swfc_gpu( npw, normalize_only, & natomwfc, wfcatom_d, swfcatom_d, .TRUE. ) ! - wfcatom = wfcatom_d - swfcatom = swfcatom_d - ! - ! ! copy atomic wavefunctions with Hubbard U term only in wfcU ! (this is then used to compute Hubbard forces and stresses) ! save to unit iunhub2 ! + wfcatom = wfcatom_d + swfcatom = swfcatom_d + ! CALL copy_U_wfc( wfcatom, noncolin ) CALL save_buffer( wfcU, nwordwfcU, iunhub2, ik ) ! @@ -142,14 +137,18 @@ SUBROUTINE orthoUwfc_gpu ! (this is used during the self-consistent solution of Kohn-Sham equations) ! save to unit iunhub ! - CALL copy_U_wfc (swfcatom, noncolin) + CALL copy_U_wfc( swfcatom, noncolin ) IF ( nks > 1 ) & - CALL save_buffer (wfcU, nwordwfcU, iunhub, ik) + CALL save_buffer( wfcU, nwordwfcU, iunhub, ik ) ! ENDDO - DEALLOCATE (wfcatom, swfcatom) - CALL deallocate_bec_type ( becp ) + ! + DEALLOCATE( wfcatom, swfcatom ) + DEALLOCATE( wfcatom_d, swfcatom_d ) + ! + CALL deallocate_bec_type( becp ) CALL using_becp_auto(2) + ! use_bgrp_in_hpsi = save_flag ! RETURN From 74c992fe7589b6a0cc0f0ad40b9cffc2ae3ea066 Mon Sep 17 00:00:00 2001 From: fabrizio22 Date: Fri, 22 May 2020 12:18:35 +0200 Subject: [PATCH 5/7] atomic_wfc_gpu complete --- PW/src/atomic_wfc_gpu.f90 | 440 ++++++++++++++++++++------------------ 1 file changed, 234 insertions(+), 206 deletions(-) diff --git a/PW/src/atomic_wfc_gpu.f90 b/PW/src/atomic_wfc_gpu.f90 index bda7b7875..097fb2963 100644 --- a/PW/src/atomic_wfc_gpu.f90 +++ b/PW/src/atomic_wfc_gpu.f90 @@ -42,7 +42,6 @@ SUBROUTINE atomic_wfc_gpu( ik, wfcatom_d ) ! INTEGER :: n_starting_wfc, lmax_wfc, nt, l, nb, na, m, lm, ig, iig, & i0, i1, i2, i3, nwfcm, npw - COMPLEX(DP), ALLOCATABLE :: aux(:) COMPLEX(DP) :: kphase, lphase REAL(DP) :: arg, px, ux, vx, wx INTEGER :: ig_start, ig_end, mil1, mil2, mil3 @@ -50,10 +49,10 @@ SUBROUTINE atomic_wfc_gpu( ik, wfcatom_d ) REAL(DP) :: xk1, xk2, xk3, qgr REAL(DP), ALLOCATABLE :: chiq_d(:,:,:) REAL(DP), ALLOCATABLE :: ylm_d(:,:), gk_d(:,:), qg_d(:) - COMPLEX(DP), ALLOCATABLE :: sk_d(:) + COMPLEX(DP), ALLOCATABLE :: sk_d(:), aux_d(:) ! #if defined(__CUDA) - attributes(DEVICE) :: wfcatom_d, ylm_d, gk_d, qg_d, sk_d, chiq_d + attributes(DEVICE) :: wfcatom_d, ylm_d, gk_d, qg_d, sk_d, chiq_d, aux_d #endif ! CALL start_clock( 'atomic_wfc' ) @@ -132,7 +131,7 @@ SUBROUTINE atomic_wfc_gpu( ik, wfcatom_d ) DEALLOCATE( qg_d, gk_d ) ! - ALLOCATE( aux(npw) ) + IF (noncolin) ALLOCATE( aux_d(npw) ) ! wfcatom_d(:,:,:) = (0.0_dp, 0.0_dp) ! @@ -163,21 +162,21 @@ SUBROUTINE atomic_wfc_gpu( ik, wfcatom_d ) ! wavefunctions for k=0 that are real in real space ! IF ( noncolin ) THEN -! ! -! IF ( upf(nt)%has_so ) THEN -! ! -! IF (starting_spin_angle.OR..NOT.domag) THEN -! CALL atomic_wfc_so( ) -! ELSE -! CALL atomic_wfc_so_mag( ) -! END IF -! ! -! ELSE -! ! -! CALL atomic_wfc_nc( ) -! ! -! END IF -! ! + ! + IF ( upf(nt)%has_so ) THEN + ! + IF (starting_spin_angle.OR..NOT.domag) THEN + CALL atomic_wfc_so_gpu( ) + ELSE + CALL atomic_wfc_so_mag_gpu( ) + END IF + ! + ELSE + ! + CALL atomic_wfc_nc_gpu( ) + ! + END IF + ! ELSE ! CALL atomic_wfc___gpu( ) @@ -193,7 +192,7 @@ SUBROUTINE atomic_wfc_gpu( ik, wfcatom_d ) IF ( n_starting_wfc /= natomwfc) call errore ('atomic_wfc', & 'internal error: some wfcs were lost ', 1 ) - DEALLOCATE( aux ) + IF (noncolin) DEALLOCATE( aux_d ) DEALLOCATE( sk_d, chiq_d, ylm_d ) ! collect results across bgrp @@ -204,191 +203,220 @@ SUBROUTINE atomic_wfc_gpu( ik, wfcatom_d ) CONTAINS !---------------------------------------------------------------- -! SUBROUTINE atomic_wfc_so( ) -! !------------------------------------------------------------ -! !! Spin-orbit case. -! ! -! REAL(DP) :: fact(2), j -! REAL(DP), EXTERNAL :: spinor -! INTEGER :: ind, ind1, n1, is, sph_ind -! ! -! j = upf(nt)%jchi(nb) -! DO m = -l-1, l -! fact(1) = spinor(l,j,m,1) -! fact(2) = spinor(l,j,m,2) -! IF ( ABS(fact(1)) > 1.d-8 .OR. ABS(fact(2)) > 1.d-8 ) THEN -! n_starting_wfc = n_starting_wfc + 1 -! IF (n_starting_wfc > natomwfc) CALL errore & -! ('atomic_wfc_so', 'internal error: too many wfcs', 1) -! DO is=1,2 -! IF (abs(fact(is)) > 1.d-8) THEN -! ind=lmaxx+1+sph_ind(l,j,m,is) -! aux=(0.d0,0.d0) -! DO n1=1,2*l+1 -! ind1=l**2+n1 -! if (abs(rot_ylm(ind,n1)) > 1.d-8) & -! aux(:)=aux(:)+rot_ylm(ind,n1)*ylm(:,ind1) -! ENDDO -! do ig = ig_start, ig_end -! wfcatom(ig,is,n_starting_wfc) = lphase*fact(is)*& -! sk(ig)*aux(ig)*chiq (ig, nb, nt) -! END DO -! ELSE -! wfcatom(:,is,n_starting_wfc) = (0.d0,0.d0) -! END IF -! END DO -! END IF -! END DO -! ! -! END SUBROUTINE atomic_wfc_so -! ! -! SUBROUTINE atomic_wfc_so_mag( ) -! ! -! !! Spin-orbit case, magnetization along "angle1" and "angle2" -! !! In the magnetic case we always assume that magnetism is much larger -! !! than spin-orbit and average the wavefunctions at l+1/2 and l-1/2 -! !! filling then the up and down spinors with the average wavefunctions, -! !! according to the direction of the magnetization, following what is -! !! done in the noncollinear case. -! ! -! REAL(DP) :: alpha, gamman, j -! COMPLEX(DP) :: fup, fdown -! REAL(DP), ALLOCATABLE :: chiaux(:) -! INTEGER :: nc, ib -! ! -! j = upf(nt)%jchi(nb) -! ! -! ! This routine creates two functions only in the case j=l+1/2 or exit in the -! ! other case -! ! -! IF (ABS(j-l+0.5_DP)<1.d-4) RETURN -! -! ALLOCATE(chiaux(npw)) -! ! -! ! Find the functions j=l-1/2 -! ! -! IF (l == 0) THEN -! chiaux(:)=chiq(:,nb,nt) -! ELSE -! DO ib=1, upf(nt)%nwfc -! IF ((upf(nt)%lchi(ib) == l).AND. & -! (ABS(upf(nt)%jchi(ib)-l+0.5_DP)<1.d-4)) THEN -! nc=ib -! EXIT -! ENDIF -! ENDDO -! ! -! ! Average the two functions -! ! -! chiaux(:)=(chiq(:,nb,nt)*(l+1.0_DP)+chiq(:,nc,nt)*l)/(2.0_DP*l+1.0_DP) -! ! -! ENDIF -! ! -! ! and construct the starting wavefunctions as in the noncollinear case. -! ! -! alpha = angle1(nt) -! gamman = - angle2(nt) + 0.5d0*pi -! ! -! DO m = 1, 2 * l + 1 -! lm = l**2 + m -! n_starting_wfc = n_starting_wfc + 1 -! IF ( n_starting_wfc + 2*l+1 > natomwfc ) CALL errore & -! ('atomic_wfc_nc', 'internal error: too many wfcs', 1) -! DO ig = ig_start, ig_end -! aux(ig) = sk(ig)*ylm(ig,lm)*chiaux(ig) -! END DO -! ! -! ! now, rotate wfc as needed -! ! first : rotation with angle alpha around (OX) -! ! -! DO ig = ig_start, ig_end -! fup = cos(0.5d0*alpha)*aux(ig) -! fdown = (0.d0,1.d0)*sin(0.5d0*alpha)*aux(ig) -! ! -! ! Now, build the orthogonal wfc -! ! first rotation with angle (alpha+pi) around (OX) -! ! -! wfcatom(ig,1,n_starting_wfc) = (cos(0.5d0*gamman) & -! +(0.d0,1.d0)*sin(0.5d0*gamman))*fup -! wfcatom(ig,2,n_starting_wfc) = (cos(0.5d0*gamman) & -! -(0.d0,1.d0)*sin(0.5d0*gamman))*fdown -! ! -! ! second: rotation with angle gamma around (OZ) -! ! -! ! Now, build the orthogonal wfc -! ! first rotation with angle (alpha+pi) around (OX) -! ! -! fup = cos(0.5d0*(alpha+pi))*aux(ig) -! fdown = (0.d0,1.d0)*sin(0.5d0*(alpha+pi))*aux(ig) -! ! -! ! second, rotation with angle gamma around (OZ) -! ! -! wfcatom(ig,1,n_starting_wfc+2*l+1) = (cos(0.5d0*gamman) & -! +(0.d0,1.d0)*sin(0.5d0 *gamman))*fup -! wfcatom(ig,2,n_starting_wfc+2*l+1) = (cos(0.5d0*gamman) & -! -(0.d0,1.d0)*sin(0.5d0*gamman))*fdown -! END DO -! END DO -! n_starting_wfc = n_starting_wfc + 2*l+1 -! DEALLOCATE( chiaux ) -! ! -! END SUBROUTINE atomic_wfc_so_mag -! ! -! SUBROUTINE atomic_wfc_nc( ) -! ! -! !! noncolinear case, magnetization along "angle1" and "angle2" -! ! -! REAL(DP) :: alpha, gamman -! COMPLEX(DP) :: fup, fdown -! ! -! alpha = angle1(nt) -! gamman = - angle2(nt) + 0.5d0*pi -! ! -! DO m = 1, 2 * l + 1 -! lm = l**2 + m -! n_starting_wfc = n_starting_wfc + 1 -! IF ( n_starting_wfc + 2*l+1 > natomwfc) CALL errore & -! ('atomic_wfc_nc', 'internal error: too many wfcs', 1) -! DO ig = ig_start, ig_end -! aux(ig) = sk(ig)*ylm(ig,lm)*chiq(ig,nb,nt) -! END DO -! ! -! ! now, rotate wfc as needed -! ! first : rotation with angle alpha around (OX) -! ! -! DO ig = ig_start, ig_end -! fup = cos(0.5d0*alpha)*aux(ig) -! fdown = (0.d0,1.d0)*sin(0.5d0*alpha)*aux(ig) -! ! -! ! Now, build the orthogonal wfc -! ! first rotation with angle (alpha+pi) around (OX) -! ! -! wfcatom(ig,1,n_starting_wfc) = (cos(0.5d0*gamman) & -! +(0.d0,1.d0)*sin(0.5d0*gamman))*fup -! wfcatom(ig,2,n_starting_wfc) = (cos(0.5d0*gamman) & -! -(0.d0,1.d0)*sin(0.5d0*gamman))*fdown -! ! -! ! second: rotation with angle gamma around (OZ) -! ! -! ! Now, build the orthogonal wfc -! ! first rotation with angle (alpha+pi) around (OX) -! ! -! fup = cos(0.5d0*(alpha+pi))*aux(ig) -! fdown = (0.d0,1.d0)*sin(0.5d0*(alpha+pi))*aux(ig) -! ! -! ! second, rotation with angle gamma around (OZ) -! ! -! wfcatom(ig,1,n_starting_wfc+2*l+1) = (cos(0.5d0*gamman) & -! +(0.d0,1.d0)*sin(0.5d0 *gamman))*fup -! wfcatom(ig,2,n_starting_wfc+2*l+1) = (cos(0.5d0*gamman) & -! -(0.d0,1.d0)*sin(0.5d0*gamman))*fdown -! END DO -! END DO -! n_starting_wfc = n_starting_wfc + 2*l+1 -! ! -! END SUBROUTINE atomic_wfc_nc + SUBROUTINE atomic_wfc_so_gpu( ) + !------------------------------------------------------------ + !! Spin-orbit case. + ! + REAL(DP) :: fact(2), fact_is, j + COMPLEX(DP) :: rot_ylm_in1 + REAL(DP), EXTERNAL :: spinor + INTEGER :: ind, ind1, n1, is, sph_ind + ! + j = upf(nt)%jchi(nb) + DO m = -l-1, l + fact(1) = spinor(l,j,m,1) + fact(2) = spinor(l,j,m,2) + IF ( ABS(fact(1)) > 1.d-8 .OR. ABS(fact(2)) > 1.d-8 ) THEN + n_starting_wfc = n_starting_wfc + 1 + IF (n_starting_wfc > natomwfc) CALL errore & + ('atomic_wfc_so', 'internal error: too many wfcs', 1) + ! + DO is = 1, 2 + fact_is = fact(is) + IF (ABS(fact(is)) > 1.d-8) THEN + ind = lmaxx + 1 + sph_ind(l,j,m,is) + aux_d = (0.d0,0.d0) + DO n1 = 1, 2*l+1 + ind1 = l**2+n1 + rot_ylm_in1 = rot_ylm(ind,n1) + IF (ABS(rot_ylm_in1) > 1.d-8) THEN + !$cuf kernel do (1) <<<*,*>>> + DO ig = ig_start, ig_end + aux_d(ig) = aux_d(ig) + rot_ylm_in1 * & + CMPLX(ylm_d(ig,ind1)) + ENDDO + ENDIF + ENDDO + !$cuf kernel do (1) <<<*,*>>> + DO ig = ig_start, ig_end + wfcatom_d(ig,is,n_starting_wfc) = lphase * & + sk_d(ig)*aux_d(ig)*CMPLX(fact_is* & + chiq_d(ig,nb,nt)) + END DO + ELSE + wfcatom_d(:,is,n_starting_wfc) = (0.d0,0.d0) + END IF + END DO + ! + END IF + END DO + ! + END SUBROUTINE atomic_wfc_so_gpu + ! + SUBROUTINE atomic_wfc_so_mag_gpu( ) + ! + !! Spin-orbit case, magnetization along "angle1" and "angle2" + !! In the magnetic case we always assume that magnetism is much larger + !! than spin-orbit and average the wavefunctions at l+1/2 and l-1/2 + !! filling then the up and down spinors with the average wavefunctions, + !! according to the direction of the magnetization, following what is + !! done in the noncollinear case. + ! + REAL(DP) :: alpha, gamman, j + COMPLEX(DP) :: fup, fdown + REAL(DP), ALLOCATABLE :: chiaux_d(:) + INTEGER :: nc, ib + ! +#if defined(__CUDA) + ATTRIBUTES(DEVICE) :: chiaux_d +#endif + ! + j = upf(nt)%jchi(nb) + ! + ! This routine creates two functions only in the case j=l+1/2 or exit in the + ! other case + ! + IF (ABS(j-l+0.5_DP)<1.d-4) RETURN - SUBROUTINE atomic_wfc___gpu( ) + ALLOCATE( chiaux_d(npw) ) + ! + ! Find the functions j=l-1/2 + ! + IF (l == 0) THEN + chiaux_d(:) = chiq_d(:,nb,nt) + ELSE + DO ib = 1, upf(nt)%nwfc + IF ((upf(nt)%lchi(ib) == l).AND. & + (ABS(upf(nt)%jchi(ib)-l+0.5_DP)<1.d-4)) THEN + nc = ib + EXIT + ENDIF + ENDDO + ! + ! Average the two functions + ! + !$cuf kernel do (1) <<<*,*>>> + DO ig = 1, npw + chiaux_d(ig) = (chiq_d(ig,nb,nt)*DBLE(l+1)+chiq_d(ig,nc,nt)*l)/ & + DBLE(2*l+1) + ENDDO + ! + ENDIF + ! + ! and construct the starting wavefunctions as in the noncollinear case. + ! + alpha = angle1(nt) + gamman = - angle2(nt) + 0.5d0*pi + ! + DO m = 1, 2*l+1 + lm = l**2+m + n_starting_wfc = n_starting_wfc + 1 + IF ( n_starting_wfc + 2*l+1 > natomwfc ) CALL errore & + ('atomic_wfc_nc', 'internal error: too many wfcs', 1) + ! + !$cuf kernel do (1) <<<*,*>>> + DO ig = ig_start, ig_end + aux_d(ig) = sk_d(ig)* CMPLX(ylm_d(ig,lm)*chiaux_d(ig)) + END DO + ! + ! now, rotate wfc as needed + ! first : rotation with angle alpha around (OX) + ! + !$cuf kernel do (1) <<<*,*>>> + DO ig = ig_start, ig_end + fup = CMPLX(COS(0.5d0*alpha))*aux_d(ig) + fdown = (0.d0,1.d0)*CMPLX(SIN(0.5d0*alpha))*aux_d(ig) + ! + ! Now, build the orthogonal wfc + ! first rotation with angle (alpha+pi) around (OX) + ! + wfcatom_d(ig,1,n_starting_wfc) = (CMPLX(COS(0.5d0*gamman)) & + +(0.d0,1.d0)*CMPLX(SIN(0.5d0*gamman)))*fup + wfcatom_d(ig,2,n_starting_wfc) = (CMPLX(COS(0.5d0*gamman)) & + -(0.d0,1.d0)*CMPLX(SIN(0.5d0*gamman)))*fdown + ! + ! second: rotation with angle gamma around (OZ) + ! + ! Now, build the orthogonal wfc + ! first rotation with angle (alpha+pi) around (OX) + ! + fup = CMPLX(COS(0.5d0*(alpha+pi)))*aux_d(ig) + fdown = (0.d0,1.d0)*CMPLX(SIN(0.5d0*(alpha+pi)))*aux_d(ig) + ! + ! second, rotation with angle gamma around (OZ) + ! + wfcatom_d(ig,1,n_starting_wfc+2*l+1) = (CMPLX(COS(0.5d0*gamman)) & + +(0.d0,1.d0)*CMPLX(SIN(0.5d0 *gamman)))*fup + wfcatom_d(ig,2,n_starting_wfc+2*l+1) = (CMPLX(COS(0.5d0*gamman)) & + -(0.d0,1.d0)*CMPLX(SIN(0.5d0*gamman)))*fdown + END DO + END DO + ! + n_starting_wfc = n_starting_wfc + 2*l+1 + ! + DEALLOCATE( chiaux_d ) + ! + END SUBROUTINE atomic_wfc_so_mag_gpu + ! + ! + SUBROUTINE atomic_wfc_nc_gpu( ) + ! + !! noncolinear case, magnetization along "angle1" and "angle2" + ! + REAL(DP) :: alpha, gamman + COMPLEX(DP) :: fup, fdown + ! + alpha = angle1(nt) + gamman = - angle2(nt) + 0.5d0*pi + ! + DO m = 1, 2*l+1 + lm = l**2 + m + n_starting_wfc = n_starting_wfc + 1 + IF ( n_starting_wfc + 2*l+1 > natomwfc) CALL errore & + ('atomic_wfc_nc', 'internal error: too many wfcs', 1) + !$cuf kernel do (1) <<<*,*>>> + DO ig = ig_start, ig_end + aux_d(ig) = sk_d(ig)*CMPLX(ylm_d(ig,lm)*chiq_d(ig,nb,nt)) + END DO + ! + ! now, rotate wfc as needed + ! first : rotation with angle alpha around (OX) + ! + !$cuf kernel do (1) <<<*,*>>> + DO ig = ig_start, ig_end + fup = CMPLX(COS(0.5d0*alpha))*aux_d(ig) + fdown = (0.d0,1.d0)*CMPLX(SIN(0.5d0*alpha))*aux_d(ig) + ! + ! Now, build the orthogonal wfc + ! first rotation with angle (alpha+pi) around (OX) + ! + wfcatom_d(ig,1,n_starting_wfc) = (CMPLX(COS(0.5d0*gamman)) & + +(0.d0,1.d0)*CMPLX(SIN(0.5d0*gamman)))*fup + wfcatom_d(ig,2,n_starting_wfc) = (CMPLX(COS(0.5d0*gamman)) & + -(0.d0,1.d0)*CMPLX(SIN(0.5d0*gamman)))*fdown + ! + ! second: rotation with angle gamma around (OZ) + ! + ! Now, build the orthogonal wfc + ! first rotation with angle (alpha+pi) around (OX) + ! + fup = CMPLX(COS(0.5d0*(alpha+pi)))*aux_d(ig) + fdown = (0.d0,1.d0)*CMPLX(SIN(0.5d0*(alpha+pi)))*aux_d(ig) + ! + ! second, rotation with angle gamma around (OZ) + ! + wfcatom_d(ig,1,n_starting_wfc+2*l+1) = (CMPLX(COS(0.5d0*gamman)) & + +(0.d0,1.d0)*CMPLX(SIN(0.5d0*gamman)))*fup + wfcatom_d(ig,2,n_starting_wfc+2*l+1) = (CMPLX(COS(0.5d0*gamman)) & + -(0.d0,1.d0)*CMPLX(SIN(0.5d0*gamman)))*fdown + END DO + END DO + n_starting_wfc = n_starting_wfc + 2*l+1 + ! + END SUBROUTINE atomic_wfc_nc_gpu + ! + ! + SUBROUTINE atomic_wfc___gpu( ) ! ! ... LSDA or nonmagnetic case ! @@ -406,6 +434,6 @@ CONTAINS ! END DO ! - END SUBROUTINE atomic_wfc___gpu - ! + END SUBROUTINE atomic_wfc___gpu + ! END SUBROUTINE atomic_wfc_gpu From caf15cc6a9c36b0a266c5f83124295c3746617d0 Mon Sep 17 00:00:00 2001 From: fabrizio22 Date: Fri, 22 May 2020 13:22:17 +0200 Subject: [PATCH 6/7] orthoatwfc_gpu --- PW/src/hinit1.f90 | 12 +++++-- PW/src/orthoatwfc_gpu.f90 | 70 +++++++++++++++++++++++++-------------- PW/src/wfcinit_gpu.f90 | 2 +- 3 files changed, 55 insertions(+), 29 deletions(-) diff --git a/PW/src/hinit1.f90 b/PW/src/hinit1.f90 index 3a45778e9..70d052a5c 100644 --- a/PW/src/hinit1.f90 +++ b/PW/src/hinit1.f90 @@ -21,7 +21,7 @@ SUBROUTINE hinit1() USE lsda_mod, ONLY : nspin USE noncollin_module, ONLY : report USE scf, ONLY : vrs, vltot, v, kedtau - USE control_flags, ONLY : tqr + USE control_flags, ONLY : tqr, use_gpu USE realus, ONLY : generate_qpointlist, betapointlist, & init_realspace_vars, real_space USE wannier_new, ONLY : use_wannier @@ -33,6 +33,7 @@ SUBROUTINE hinit1() USE dfunct, ONLY : newd ! USE scf_gpum, ONLY : using_vrs + ! IMPLICIT NONE ! @@ -82,8 +83,13 @@ SUBROUTINE hinit1() ! ... and recalculate the products of the S with the atomic wfcs used ! ... in LDA+U calculations ! - IF ( lda_plus_u ) CALL orthoUwfc() - IF ( use_wannier ) CALL orthoatwfc( .TRUE. ) + IF (.NOT. use_gpu) THEN + IF ( lda_plus_u ) CALL orthoUwfc() + IF ( use_wannier ) CALL orthoatwfc( .TRUE. ) + ELSE + IF ( lda_plus_u ) CALL orthoUwfc_gpu() + IF ( use_wannier ) CALL orthoatwfc_gpu( .TRUE. ) + ENDIF ! ! RETURN diff --git a/PW/src/orthoatwfc_gpu.f90 b/PW/src/orthoatwfc_gpu.f90 index 7b83cf459..a3be5aa63 100644 --- a/PW/src/orthoatwfc_gpu.f90 +++ b/PW/src/orthoatwfc_gpu.f90 @@ -106,6 +106,7 @@ SUBROUTINE orthoUwfc_gpu ! IF (noncolin) THEN CALL atomic_wfc_nc_updown( ik, wfcatom ) + wfcatom_d = wfcatom ELSE CALL atomic_wfc_gpu( ik, wfcatom_d ) ENDIF @@ -170,7 +171,7 @@ SUBROUTINE orthoatwfc_gpu( orthogonalize_wfc ) USE io_files, ONLY : iunsat, nwordatwfc USE ions_base, ONLY : nat USE basis, ONLY : natomwfc, swfcatom - USE klist, ONLY : nks, xk, ngk, igk_k + USE klist, ONLY : nks, xk, ngk, igk_k, igk_k_d USE wvfct, ONLY : npwx USE uspp, ONLY : nkb, vkb USE becmod, ONLY : allocate_bec_type, deallocate_bec_type, & @@ -178,51 +179,70 @@ SUBROUTINE orthoatwfc_gpu( orthogonalize_wfc ) USE control_flags, ONLY : gamma_only USE noncollin_module, ONLY : noncolin, npol ! - USE uspp_gpum, ONLY : using_vkb + USE uspp_gpum, ONLY : using_vkb, using_vkb_d, vkb_d + USE becmod_gpum, ONLY : becp_d + USE becmod_subs_gpum, ONLY : using_becp_auto, using_becp_d_auto, & + calbec_gpu ! IMPLICIT NONE ! - LOGICAL, INTENT(in) :: orthogonalize_wfc + LOGICAL, INTENT(IN) :: orthogonalize_wfc ! INTEGER :: ik, ibnd, info, i, j, k, na, nb, nt, isym, n, ntemp, m, & - l, lm, ltot, ntot, ipol, npw + l, lm, ltot, ntot, ipol, npw ! ik: the k point under consideration ! ibnd: counter on bands LOGICAL :: normalize_only = .FALSE. - COMPLEX(DP) , ALLOCATABLE :: wfcatom (:,:) - + COMPLEX(DP), ALLOCATABLE :: wfcatom(:,:) + COMPLEX(DP), ALLOCATABLE :: wfcatom_d(:,:), swfcatom_d(:,:) + ! +#if defined(__CUDA) + attributes(DEVICE) :: wfcatom_d, swfcatom_d +#endif + ! normalize_only=.FALSE. - ALLOCATE (wfcatom( npwx*npol, natomwfc)) - + ALLOCATE( wfcatom(npwx*npol,natomwfc) ) + ALLOCATE( wfcatom_d(npwx*npol,natomwfc), swfcatom_d(npwx*npol,natomwfc) ) + ! ! Allocate the array becp = - CALL allocate_bec_type (nkb,natomwfc, becp) - + CALL allocate_bec_type( nkb, natomwfc, becp ) + ! DO ik = 1, nks - + ! IF (noncolin) THEN - CALL atomic_wfc_nc_updown (ik, wfcatom) + CALL atomic_wfc_nc_updown( ik, wfcatom ) + wfcatom_d = wfcatom ELSE - CALL atomic_wfc (ik, wfcatom) + CALL atomic_wfc_gpu( ik, wfcatom_d ) ENDIF - npw = ngk (ik) - CALL using_vkb(1) - CALL init_us_2 (npw, igk_k(1,ik), xk (1, ik), vkb) - CALL calbec (npw, vkb, wfcatom, becp) - CALL s_psi (npwx, npw, natomwfc, wfcatom, swfcatom) - - IF (orthogonalize_wfc) & - CALL ortho_swfc ( npw, normalize_only, natomwfc, wfcatom, swfcatom, .FALSE. ) + ! + npw = ngk(ik) + !CALL using_vkb(1) + CALL using_vkb_d(2) + CALL init_us_2_gpu( npw, igk_k_d(1,ik), xk(1,ik), vkb_d ) + ! + CALL using_becp_d_auto(2) + CALL calbec_gpu( npw, vkb_d, wfcatom_d, becp_d ) + ! + CALL s_psi_gpu( npwx, npw, natomwfc, wfcatom_d, swfcatom_d ) + ! + IF (orthogonalize_wfc) CALL ortho_swfc_gpu( npw, normalize_only, & + natomwfc, wfcatom_d, swfcatom_d, .FALSE. ) ! ! write S * atomic wfc to unit iunsat ! - CALL save_buffer (swfcatom, nwordatwfc, iunsat, ik) + swfcatom = swfcatom_d + ! + CALL save_buffer( swfcatom, nwordatwfc, iunsat, ik ) ! ENDDO - DEALLOCATE (wfcatom) - CALL deallocate_bec_type ( becp ) + ! + DEALLOCATE( wfcatom ) + DEALLOCATE( wfcatom_d, swfcatom_d ) + CALL deallocate_bec_type( becp ) ! RETURN - + ! END SUBROUTINE orthoatwfc_gpu ! !----------------------------------------------------------------------- diff --git a/PW/src/wfcinit_gpu.f90 b/PW/src/wfcinit_gpu.f90 index a12780c8e..8ea874d8e 100644 --- a/PW/src/wfcinit_gpu.f90 +++ b/PW/src/wfcinit_gpu.f90 @@ -51,7 +51,7 @@ SUBROUTINE wfcinit_gpu() ! ! ... Orthogonalized atomic functions needed for DFT+U and other cases ! - IF ( use_wannier .OR. one_atom_occupations ) CALL orthoatwfc ( use_wannier ) + IF ( use_wannier .OR. one_atom_occupations ) CALL orthoatwfc_gpu( use_wannier ) IF ( lda_plus_u ) CALL orthoUwfc_gpu() ! ! ... open files/buffer for wavefunctions (nwordwfc set in openfil) From 7daf0ea809ec8944579eb9d63c694c66cd9637ae Mon Sep 17 00:00:00 2001 From: fabrizio22 Date: Thu, 28 May 2020 11:39:30 +0200 Subject: [PATCH 7/7] orthoatwfc_gpu small fix --- PW/src/orthoatwfc_gpu.f90 | 3 +++ 1 file changed, 3 insertions(+) diff --git a/PW/src/orthoatwfc_gpu.f90 b/PW/src/orthoatwfc_gpu.f90 index a3be5aa63..4fc7c374b 100644 --- a/PW/src/orthoatwfc_gpu.f90 +++ b/PW/src/orthoatwfc_gpu.f90 @@ -207,6 +207,7 @@ SUBROUTINE orthoatwfc_gpu( orthogonalize_wfc ) ! Allocate the array becp = CALL allocate_bec_type( nkb, natomwfc, becp ) ! + DO ik = 1, nks ! IF (noncolin) THEN @@ -221,6 +222,7 @@ SUBROUTINE orthoatwfc_gpu( orthogonalize_wfc ) CALL using_vkb_d(2) CALL init_us_2_gpu( npw, igk_k_d(1,ik), xk(1,ik), vkb_d ) ! + CALL using_becp_auto(2) CALL using_becp_d_auto(2) CALL calbec_gpu( npw, vkb_d, wfcatom_d, becp_d ) ! @@ -301,6 +303,7 @@ SUBROUTINE ortho_swfc_gpu( npw, normalize_only, m, wfc_d, swfc_d, lflag ) work_d(:,:) = (0.d0,0.d0) ! ! calculate overlap matrix + ! IF (noncolin) THEN CALL cublasZgemm( 'C', 'N', m, m, npwx*npol, (1.d0,0.d0), wfc_d, &