From 2e4ee8f50a44aaef4d65f7a994d83fed9436aee9 Mon Sep 17 00:00:00 2001 From: afonari Date: Sat, 17 Aug 2024 18:53:42 +0000 Subject: [PATCH 1/7] Update matdyn.f90. Add q-point weight to the .freq output --- PHonon/PH/matdyn.f90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PHonon/PH/matdyn.f90 b/PHonon/PH/matdyn.f90 index 320c8e0d8..6aab39775 100644 --- a/PHonon/PH/matdyn.f90 +++ b/PHonon/PH/matdyn.f90 @@ -762,7 +762,7 @@ PROGRAM matdyn OPEN (unit=2,file=flfrq ,status='unknown',form='formatted') WRITE(2, '(" &plot nbnd=",i4,", nks=",i4," /")') 3*nat, nq DO n=1, nq - WRITE(2, '(10x,3f10.6)') q(1,n), q(2,n), q(3,n) + WRITE(2, '(10x,4f10.6)') q(1,n), q(2,n), q(3,n), wq(n) WRITE(2,'(6f10.4)') (freq(i,n), i=1,3*nat) END DO CLOSE(unit=2) From 7e7bbae0ea00bdcc8ffff3e77ffa2c2db8de3690 Mon Sep 17 00:00:00 2001 From: Ivan Carnimeo Date: Mon, 19 Aug 2024 12:14:01 +0200 Subject: [PATCH 2/7] DFT-D3 documentation --- PP/Doc/INPUT_D3HESS.def | 2 ++ dft-d3/core.f90 | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/PP/Doc/INPUT_D3HESS.def b/PP/Doc/INPUT_D3HESS.def index 096674612..abcfd6615 100644 --- a/PP/Doc/INPUT_D3HESS.def +++ b/PP/Doc/INPUT_D3HESS.def @@ -19,6 +19,8 @@ input_description -distribution {Quantum ESPRESSO} -package PWscf -program d3hes (3) run phonon Please note that filhess in d3hess input and dftd3_hess in phonon input, if given, should match. + Please also note that second derivatives of the three-body term of d3 dispersion are not implemented, + and phonon calculations with d3 should be run with dftd3_threebody=.false. in the SCF. @b {Structure of the input data:} ============================ diff --git a/dft-d3/core.f90 b/dft-d3/core.f90 index 020e0f589..e055cb3b1 100644 --- a/dft-d3/core.f90 +++ b/dft-d3/core.f90 @@ -4028,7 +4028,8 @@ contains real(wp) :: res1, res2, gnorm_supercell if(num) Call errore('pbcgdisp', 'Atom displacement not implemented with numerical forces', 1) - if(.not.noabc) Call errore('pbcgdisp', 'Atom displacement not implemented with the threebody term', 1) + if(.not.noabc) Call errore('pbcgdisp', 'Atom displacement not implemented with the threebody term ' // & + ' (set dftd3_threebody=.false. for phonon calculations)', 1) ns = shape(g_supercell_) g_supercell( -ns(1)/2:ns(1)/2, -ns(2)/2:ns(2)/2, -ns(3)/2:ns(3)/2, 1:ns(4), 1:ns(5) ) => g_supercell_ From 27cf6103f7a73eb050fbf87ffb13d5923b24c5ea Mon Sep 17 00:00:00 2001 From: Jae-Mo Lihm Date: Mon, 19 Aug 2024 20:28:28 +0000 Subject: [PATCH 3/7] Modularize linear response symmetry - symmetrization of density and potential --- GWW/head/solve_head.f90 | 7 +- HP/src/hp_solve_linear_system.f90 | 16 ++++ LR_Modules/efermi_shift.f90 | 11 +-- LR_Modules/lrcom.f90 | 15 +++- PHonon/CMakeLists.txt | 3 +- PHonon/PH/Makefile | 3 +- PHonon/PH/ph_set_upert.f90 | 129 ++++++++++++++++++++++++++++++ PHonon/PH/phescf.f90 | 6 ++ PHonon/PH/phqscf.f90 | 4 + PHonon/PH/psymdvscf.f90 | 63 +++++++-------- PHonon/PH/psyme.f90 | 65 --------------- PHonon/PH/solve_e.f90 | 2 +- PHonon/PH/solve_e_fpol.f90 | 2 +- PHonon/PH/solve_linter.f90 | 6 +- PHonon/PH/sym_def.f90 | 57 ++++++------- PHonon/PH/sym_dns_wrapper.f90 | 2 + PHonon/PH/symdvscf.f90 | 115 ++++++++++++++------------ PHonon/PH/syme.f90 | 78 ------------------ PHonon/PH/zstar_eu_us.f90 | 2 +- 19 files changed, 306 insertions(+), 280 deletions(-) create mode 100644 PHonon/PH/ph_set_upert.f90 delete mode 100644 PHonon/PH/psyme.f90 delete mode 100644 PHonon/PH/syme.f90 diff --git a/GWW/head/solve_head.f90 b/GWW/head/solve_head.f90 index 7001330d3..e6e55ca45 100644 --- a/GWW/head/solve_head.f90 +++ b/GWW/head/solve_head.f90 @@ -470,13 +470,14 @@ subroutine solve_head head(first_f+i-1,2)=epsilon_g(2,2,i) head(first_f+i-1,3)=epsilon_g(3,3,i) - + call ph_set_upert_e() #if defined(__MPI) call mp_sum ( pola_charge(:,:,:,i) , inter_pool_comm ) - call psyme (pola_charge(:,:,:,i)) + call psymdvscf (pola_charge(:,:,:,i)) #else - call syme (pola_charge(:,:,:,i)) + call symdvscf (pola_charge(:,:,:,i)) #endif + call ph_deallocate_upert() call create_scf_type ( wing, .true. ) do ipol=1,3 CALL fwfft ('Rho', pola_charge(1:dfftp%nnr,1,ipol,i), dfftp) diff --git a/HP/src/hp_solve_linear_system.f90 b/HP/src/hp_solve_linear_system.f90 index a828eabce..96f6d1cf0 100644 --- a/HP/src/hp_solve_linear_system.f90 +++ b/HP/src/hp_solve_linear_system.f90 @@ -55,6 +55,7 @@ SUBROUTINE hp_solve_linear_system (na, iq) USE qpoint_aux, ONLY : ikmks, ikmkmqs, becpt USE lsda_mod, ONLY : nspin USE lr_nc_mag, ONLY : lr_apply_time_reversal, deeq_nc_save, int3_nc_save + USE lr_symm_base, ONLY : lr_npert, upert, upert_mq ! IMPLICIT NONE ! @@ -107,6 +108,7 @@ SUBROUTINE hp_solve_linear_system (na, iq) ik, ikk, & ! counter on k points ndim, & is, & ! counter on spin polarizations + isym, & ! counter on symmetries npw, & ! number of plane waves at k nsolv, & ! number of linear systems isolv, & ! counter on linear systems @@ -147,6 +149,18 @@ SUBROUTINE hp_solve_linear_system (na, iq) ! ALLOCATE (dbecsum((nhm*(nhm+1))/2, nat, nspin_mag, 1)) ! + ! Set symmetry representation in lr_symm_base + ! + lr_npert = 1 + ALLOCATE(upert(lr_npert, lr_npert, nsymq)) + DO isym = 1, nsymq + upert(1, 1, isym) = (1.d0, 0.d0) + ENDDO + IF (minus_q) THEN + ALLOCATE(upert_mq(lr_npert, lr_npert)) + upert_mq(1, 1) = (1.d0, 0.d0) + ENDIF ! minus_q + ! nsolv=1 IF (noncolin.AND.domag) nsolv=2 ! @@ -497,6 +511,8 @@ SUBROUTINE hp_solve_linear_system (na, iq) DEALLOCATE (dvscfin) DEALLOCATE (dvscfout) DEALLOCATE (trace_dns_tot_old) + DEALLOCATE (upert) + IF (minus_q) DEALLOCATE(upert_mq) ! IF (ALLOCATED(dbecsum_nc)) DEALLOCATE (dbecsum_nc) IF (ALLOCATED(int3_nc)) DEALLOCATE(int3_nc) diff --git a/LR_Modules/efermi_shift.f90 b/LR_Modules/efermi_shift.f90 index a87bc03ce..ca0e55b6e 100644 --- a/LR_Modules/efermi_shift.f90 +++ b/LR_Modules/efermi_shift.f90 @@ -14,17 +14,16 @@ MODULE efermi_shift ! ! Define an abstract interface to use a callback ABSTRACT INTERFACE - SUBROUTINE def_symmetrization(def, irr) + SUBROUTINE def_symmetrization(def) USE kinds, ONLY : DP - INTEGER :: irr - COMPLEX(DP) :: def(3) + COMPLEX(DP), INTENT(inout) :: def(3) END SUBROUTINE END INTERFACE ! CONTAINS ! !----------------------------------------------------------------------- -SUBROUTINE ef_shift (npert, dos_ef, ldos, drhoscf, dbecsum, becsum1, irr, sym_def) +SUBROUTINE ef_shift (npert, dos_ef, ldos, drhoscf, dbecsum, becsum1, sym_def) !----------------------------------------------------------------------- !! This routine takes care of the effects of a shift of Ef, due to the !! perturbation, that can take place in a metal at q=0 @@ -61,8 +60,6 @@ SUBROUTINE ef_shift (npert, dos_ef, ldos, drhoscf, dbecsum, becsum1, irr, sym_de REAL(DP), INTENT(IN), OPTIONAL :: becsum1((nhm*(nhm+1))/2, nat, nspin_mag) !! becsum1 = wdelta * !! (where wdelta is a Dirac-delta-like function) - INTEGER, INTENT(IN), OPTIONAL :: irr - !! index of the current irr. rep. Used only in sym_def. PROCEDURE(def_symmetrization), OPTIONAL :: sym_def !! Symmetrization routine for the fermi energy change ! @@ -104,7 +101,7 @@ SUBROUTINE ef_shift (npert, dos_ef, ldos, drhoscf, dbecsum, becsum1, irr, sym_de ! ! symmetrizes the Fermi energy shift ! - IF (present(sym_def)) CALL sym_def(def, irr) + IF (present(sym_def)) CALL sym_def(def) WRITE( stdout, '(5x,"Pert. #",i3,": Fermi energy shift (Ry) =",2es15.4)')& (ipert, def (ipert) , ipert = 1, npert ) ! diff --git a/LR_Modules/lrcom.f90 b/LR_Modules/lrcom.f90 index 40526871e..73520c753 100644 --- a/LR_Modules/lrcom.f90 +++ b/LR_Modules/lrcom.f90 @@ -23,7 +23,7 @@ MODULE qpoint INTEGER :: nksq, npwq, nksqtot ! the real number of k points ! the number of plane waves for q - ! the total number of q points + ! the total number of q points INTEGER, ALLOCATABLE :: ikks(:), ikqs(:) ! the index of k point in the list of k ! the index of k+q point in the list of k @@ -147,6 +147,19 @@ MODULE lr_symm_base LOGICAL :: minus_q, & ! if .TRUE. there is the symmetry sending q<->-q invsymq ! if .TRUE. the small group of q has inversion ! + ! Symmetry representation of the perturbations + ! + INTEGER :: lr_npert + !! Number of perturbations considered at the same time. + !! e.g., for phonons: dimension of the irreducible representation + !! e.g., for electric fields: 3 + COMPLEX(DP), ALLOCATABLE :: upert(:, :, :) + !! Representation of the symmetry in the perturbation basis. Size (lr_npert, lr_npert, nsymq) + !! e.g., for phonons: transformation matrix of the patterns + !! e.g., for electric fields: transformation matrix of Cartesian vectors + COMPLEX(DP), ALLOCATABLE :: upert_mq(:, :) + !! Representation of the symmetry that transforms q to -q. Size (lr_npert, lr_npert) + ! END MODULE lr_symm_base ! MODULE lrus diff --git a/PHonon/CMakeLists.txt b/PHonon/CMakeLists.txt index 1309d4359..d2fd24015 100644 --- a/PHonon/CMakeLists.txt +++ b/PHonon/CMakeLists.txt @@ -89,6 +89,7 @@ set(src_ph PH/openfilq.f90 PH/phcom.f90 PH/ph_restart.f90 + PH/ph_set_upert.f90 PH/phescf.f90 PH/phq_init.f90 PH/phq_readin.f90 @@ -102,7 +103,6 @@ set(src_ph PH/prepare_sym_analysis.f90 PH/psidspsi.f90 PH/psymdvscf.f90 - PH/psyme.f90 PH/psym_dmag.f90 PH/psym_dmage.f90 PH/punch_plot_e.f90 @@ -136,7 +136,6 @@ set(src_ph PH/symdvscf.f90 PH/symdyn_munu.f90 PH/symdynph_gq.f90 - PH/syme.f90 PH/symm.f90 PH/symmorphic_or_nzb.f90 PH/swfc.f90 diff --git a/PHonon/PH/Makefile b/PHonon/PH/Makefile index 67241d38f..d42e77000 100644 --- a/PHonon/PH/Makefile +++ b/PHonon/PH/Makefile @@ -108,13 +108,13 @@ phq_recover.o \ phq_setup.o \ phq_summary.o \ phqscf.o \ +ph_set_upert.o \ polariz.o \ print_clock_ph.o \ prepare_q.o \ prepare_sym_analysis.o \ psidspsi.o \ psymdvscf.o \ -psyme.o \ psym_dmag.o \ psym_dmage.o \ punch_plot_e.o \ @@ -147,7 +147,6 @@ sym_dmage.o \ symdvscf.o \ symdyn_munu.o \ symdynph_gq.o \ -syme.o \ symm.o \ symmorphic_or_nzb.o \ swfc.o \ diff --git a/PHonon/PH/ph_set_upert.f90 b/PHonon/PH/ph_set_upert.f90 new file mode 100644 index 000000000..96896fafc --- /dev/null +++ b/PHonon/PH/ph_set_upert.f90 @@ -0,0 +1,129 @@ +! +! Copyright (C) 2001-2018 Quantum ESPRESSO group +! This file is distributed under the terms of the +! GNU General Public License. See the file `License' +! in the root directory of the present distribution, +! or http://www.gnu.org/copyleft/gpl.txt . +! +!------------------------------------------------------------------------------------------ +SUBROUTINE ph_set_upert_phonon(irr) + !-------------------------------------------------------------------------------------- + !! Set lr_npert, upert, and upert_mp for phonons in irreducible representation irr + !-------------------------------------------------------------------------------------- + ! + USE modes, ONLY : npert, t, tmq + USE control_ph, ONLY : lgamma_gamma + USE lr_symm_base, ONLY : nsymq, minus_q, lr_npert, upert, upert_mq + ! + IMPLICIT NONE + ! + INTEGER, INTENT(IN) :: irr + !! irreducible representation + ! + INTEGER :: ipert, jpert + !! Counter on perturbations + INTEGER :: isym + !! Counter on symmetries + ! + ! Set symmetry representation in lr_symm_base + ! + lr_npert = npert(irr) + ! + IF (lgamma_gamma) THEN + ! + ! If lgamma_gamma is true, symmetrization is not used. + ! Set upert and upert_mq to 1. + ! + IF (lr_npert /= 1) CALL errore('ph_set_upert_phonon', & + 'lgamma_gamma is true, but lr_npert /= 1', 1) + ! + ALLOCATE(upert(1, 1, 1)) + upert(1, 1, 1) = (1.d0, 0.d0) + ! + IF (minus_q) THEN + ALLOCATE(upert_mq(1, 1)) + upert_mq(1, 1) = (1.d0, 0.d0) + ENDIF + ! + ELSE + ! + ALLOCATE(upert(lr_npert, lr_npert, nsymq)) + ! + DO isym = 1, nsymq + DO ipert = 1, lr_npert + DO jpert = 1, lr_npert + upert(jpert, ipert, isym) = t(jpert, ipert, isym, irr) + ENDDO + ENDDO + ENDDO + ! + IF (minus_q) THEN + ALLOCATE(upert_mq(lr_npert, lr_npert)) + DO ipert = 1, lr_npert + DO jpert = 1, lr_npert + upert_mq(jpert, ipert) = tmq(jpert, ipert, irr) + ENDDO + ENDDO + ENDIF ! minus_q + ! + ENDIF + ! +END SUBROUTINE ph_set_upert_phonon +!----------------------------------------------------------------------------------------- +! +!----------------------------------------------------------------------------------------- +SUBROUTINE ph_set_upert_e() + !-------------------------------------------------------------------------------------- + !! Set lr_npert, upert, and upert_mp for electric field perturbation. + !-------------------------------------------------------------------------------------- + ! + USE symm_base, ONLY : s + USE lr_symm_base, ONLY : nsymq, minus_q, lr_npert, upert, upert_mq + ! + IMPLICIT NONE + ! + INTEGER :: ipol, jpol + !! Counter on perturbations + INTEGER :: isym + !! Counter on symmetries + ! + ! Set symmetry representation in lr_symm_base + ! + lr_npert = 3 + ! + ALLOCATE(upert(lr_npert, lr_npert, nsymq)) + ! + DO isym = 1, nsymq + DO ipol = 1, 3 + DO jpol = 1, 3 + upert(jpol, ipol, isym) = s(ipol, jpol, isym) + ENDDO + ENDDO + ENDDO + ! + IF (minus_q) THEN + ! + ! upert_mq is the rotation matrix for symmetry S such that T * S * q = q + G. + ! E field perturbation is applied only for q = 0, where T*q = q, i.e., S = identity. + ! Thus, upert_mq is the 3*3 identity matrix. + ! + ALLOCATE(upert_mq(lr_npert, lr_npert)) + upert_mq = (0.d0, 0.d0) + DO ipol = 1, lr_npert + upert_mq(ipol, ipol) = (1.d0, 0.d0) + ENDDO + ENDIF ! minus_q + ! +END SUBROUTINE ph_set_upert_e +!---------------------------------------------------------------------------------------- +! +!---------------------------------------------------------------------------------------- +SUBROUTINE ph_deallocate_upert() +!---------------------------------------------------------------------------------------- + USE lr_symm_base, ONLY : nsymq, minus_q, lr_npert, upert, upert_mq + IMPLICIT NONE + DEALLOCATE(upert) + IF (minus_q) DEALLOCATE(upert_mq) +!---------------------------------------------------------------------------------------- +END SUBROUTINE ph_deallocate_upert +!---------------------------------------------------------------------------------------- diff --git a/PHonon/PH/phescf.f90 b/PHonon/PH/phescf.f90 index 4212777aa..7e97aa7ed 100644 --- a/PHonon/PH/phescf.f90 +++ b/PHonon/PH/phescf.f90 @@ -51,6 +51,10 @@ SUBROUTINE phescf() IF (noncolin) ALLOCATE(int3_nc( nhm, nhm, nat, nspin, 3)) ENDIF ! + ! Set symmetry representation in lr_symm_base + ! + CALL ph_set_upert_e() + ! ! DFPT+U: dnsscf in the electric field calculation ! is the scf change of atomic occupations ns induced by the electric field. ! dnsscf_all_modes = dnsscf because nirr=1, number of perturbations = 3. @@ -134,6 +138,8 @@ SUBROUTINE phescf() IF (noncolin) DEALLOCATE(int3_nc) ENDIF ! + CALL ph_deallocate_upert() + ! ! DFPT+U ! IF (lda_plus_u) THEN diff --git a/PHonon/PH/phqscf.f90 b/PHonon/PH/phqscf.f90 index 61d0dc9e8..0933ebe2e 100644 --- a/PHonon/PH/phqscf.f90 +++ b/PHonon/PH/phqscf.f90 @@ -72,6 +72,8 @@ SUBROUTINE phqscf DO irr = 1, nirr IF ( (comp_irr (irr)) .AND. (.NOT.done_irr (irr)) ) THEN npe=npert(irr) + CALL ph_set_upert_phonon(irr) + ! ALLOCATE (drhoscfs( dfftp%nnr , nspin_mag, npe)) imode0 = 0 DO irr1 = 1, irr - 1 @@ -144,6 +146,8 @@ SUBROUTINE phqscf IF (okpaw) DEALLOCATE (int3_paw) IF (noncolin) DEALLOCATE(int3_nc) ENDIF + CALL ph_deallocate_upert() + ! tcpu = get_clock ('PHONON') ! DEALLOCATE (drhoscfs) diff --git a/PHonon/PH/psymdvscf.f90 b/PHonon/PH/psymdvscf.f90 index edd6e828e..5714dd6af 100644 --- a/PHonon/PH/psymdvscf.f90 +++ b/PHonon/PH/psymdvscf.f90 @@ -7,25 +7,23 @@ ! ! !----------------------------------------------------------------------- -SUBROUTINE psymdvscf (nper, irr, dvtosym) +SUBROUTINE psymdvscf (dvtosym) !----------------------------------------------------------------------- !! p-symmetrize the potential. + !! + !! The real space points of dv is distributed, but symmetry may map a point in one + !! core to a point in a different core. Hence, gather dvscf in real space, symmetrize, + !! and then scatter back. ! - USE kinds, ONLY : DP + USE kinds, ONLY : DP + USE fft_base, ONLY : dfftp USE noncollin_module, ONLY : nspin_mag - USE mp_bands, ONLY : me_bgrp - USE fft_base, ONLY : dfftp - USE scatter_mod, ONLY : cgather_sym - - USE lr_symm_base, ONLY : nsymq, minus_q + USE scatter_mod, ONLY : cgather_sym + USE lr_symm_base, ONLY : nsymq, minus_q, lr_npert ! IMPLICIT NONE ! - INTEGER :: nper - !! the number of perturbations - INTEGER :: irr - !! the representation under consideration - COMPLEX(DP) :: dvtosym(dfftp%nnr,nspin_mag,nper) + COMPLEX(DP) :: dvtosym(dfftp%nnr, nspin_mag, lr_npert) !! the potential to symmetrize ! ! ... local variable @@ -33,27 +31,30 @@ SUBROUTINE psymdvscf (nper, irr, dvtosym) #if defined (__MPI) ! INTEGER :: i, is, iper, ir3, ioff, ioff_tg, nxyp - + ! COMPLEX(DP), ALLOCATABLE :: ddvtosym (:,:,:) ! the potential to symm - - - IF (nsymq.EQ.1.AND. (.NOT.minus_q) ) RETURN + IF (nsymq == 1 .AND. (.NOT.minus_q) ) RETURN CALL start_clock ('psymdvscf') - - ALLOCATE (ddvtosym ( dfftp%nr1x * dfftp%nr2x * dfftp%nr3x, nspin_mag, nper)) - - DO iper = 1, nper + ! + ALLOCATE (ddvtosym ( dfftp%nr1x * dfftp%nr2x * dfftp%nr3x, nspin_mag, lr_npert)) + ! + ! Gather real-space points + ! + DO iper = 1, lr_npert DO is = 1, nspin_mag CALL cgather_sym (dfftp, dvtosym (:, is, iper), ddvtosym (:, is, iper) ) ENDDO - ENDDO - - CALL symdvscf (nper, irr, ddvtosym) - + ! + ! Symmetrize + ! + CALL symdvscf (ddvtosym) + ! + ! Scatter back the real-space points + ! nxyp = dfftp%nr1x * dfftp%my_nr2p - DO iper = 1, nper + DO iper = 1, lr_npert DO is = 1, nspin_mag DO ir3 = 1, dfftp%my_nr3p ioff = dfftp%nr1x * dfftp%my_nr2p * (ir3-1) @@ -63,14 +64,12 @@ SUBROUTINE psymdvscf (nper, irr, dvtosym) ENDDO ENDDO DEALLOCATE (ddvtosym) - + ! CALL stop_clock ('psymdvscf') #else - - CALL symdvscf (nper, irr, dvtosym) - + ! + CALL symdvscf(dvtosym) + ! #endif - - RETURN - + ! END SUBROUTINE psymdvscf diff --git a/PHonon/PH/psyme.f90 b/PHonon/PH/psyme.f90 deleted file mode 100644 index 9cde98fad..000000000 --- a/PHonon/PH/psyme.f90 +++ /dev/null @@ -1,65 +0,0 @@ -! -! Copyright (C) 2001-2008 Quantum ESPRESSO group -! This file is distributed under the terms of the -! GNU General Public License. See the file `License' -! in the root directory of the present distribution, -! or http://www.gnu.org/copyleft/gpl.txt . -! -! -!----------------------------------------------------------------------- -SUBROUTINE psyme (dvtosym) - !----------------------------------------------------------------------- - !! p-symmetrize the charge density. - ! - USE kinds, ONLY : DP - USE fft_base, ONLY : dfftp - USE noncollin_module, ONLY : nspin_mag - USE mp_bands, ONLY : me_bgrp - USE fft_base, ONLY : dfftp - USE scatter_mod, ONLY : cgather_sym - ! - IMPLICIT NONE - ! - COMPLEX(DP) :: dvtosym (dfftp%nnr, nspin_mag, 3) - !! the potential to symmetrize - ! - ! ... local variables - ! -#if defined (__MPI) - ! - INTEGER :: i, is, iper, ir3, ioff, ioff_tg, nxyp - COMPLEX(DP), ALLOCATABLE :: ddvtosym (:,:,:) - ! the potential to symmet - ! - ! - ALLOCATE (ddvtosym ( dfftp%nr1x * dfftp%nr2x * dfftp%nr3x, nspin_mag, 3)) - - DO iper = 1, 3 - DO is = 1, nspin_mag - CALL cgather_sym (dfftp,dvtosym (:, is, iper), ddvtosym (:, is, iper) ) - ENDDO - - ENDDO - - CALL syme (ddvtosym) - - nxyp = dfftp%nr1x * dfftp%my_nr2p - DO iper = 1, 3 - DO is = 1, nspin_mag - DO ir3 = 1, dfftp%my_nr3p - ioff = dfftp%nr1x * dfftp%my_nr2p * (ir3-1) - ioff_tg = dfftp%nr1x * dfftp%nr2x * (dfftp%my_i0r3p+ir3-1) + dfftp%nr1x * dfftp%my_i0r2p - CALL zcopy (nxyp, ddvtosym (ioff_tg+1, is, iper), 1, dvtosym (ioff+1, is, iper), 1) - END DO - ENDDO - ENDDO - - DEALLOCATE (ddvtosym) - -#else - CALL syme (dvtosym) -#endif - - RETURN - -END SUBROUTINE psyme diff --git a/PHonon/PH/solve_e.f90 b/PHonon/PH/solve_e.f90 index 51fbe87f0..edfd4c866 100644 --- a/PHonon/PH/solve_e.f90 +++ b/PHonon/PH/solve_e.f90 @@ -234,7 +234,7 @@ subroutine solve_e call mp_sum ( dvscfout, inter_pool_comm ) IF (okpaw) call mp_sum ( dbecsum, inter_pool_comm ) if (.not.lgamma_gamma) then - call psyme (dvscfout) + call psymdvscf(dvscfout) IF ( noncolin.and.domag ) CALL psym_dmage(dvscfout) endif ! diff --git a/PHonon/PH/solve_e_fpol.f90 b/PHonon/PH/solve_e_fpol.f90 index 9d7226352..4ad7abc38 100644 --- a/PHonon/PH/solve_e_fpol.f90 +++ b/PHonon/PH/solve_e_fpol.f90 @@ -315,7 +315,7 @@ subroutine solve_e_fpol( iw ) ! for the three polarizations - symmetrize it ! call mp_sum ( dvscfout, inter_pool_comm ) - call psyme (dvscfout) + call psymdvscf(dvscfout) ! ! save the symmetrized linear charge response to file ! calculate the corresponding linear potential response diff --git a/PHonon/PH/solve_linter.f90 b/PHonon/PH/solve_linter.f90 index 1c5804106..61cabf1d5 100644 --- a/PHonon/PH/solve_linter.f90 +++ b/PHonon/PH/solve_linter.f90 @@ -395,9 +395,9 @@ SUBROUTINE solve_linter (irr, imode0, npe, drhoscf) ! IF (lmetq0) THEN IF (okpaw) THEN - CALL ef_shift(npe, dos_ef, ldos, drhoscfh, dbecsum, becsum1, irr, sym_def) + CALL ef_shift(npe, dos_ef, ldos, drhoscfh, dbecsum, becsum1, sym_def) ELSE - CALL ef_shift(npe, dos_ef, ldos, drhoscfh, irr=irr, sym_def=sym_def) + CALL ef_shift(npe, dos_ef, ldos, drhoscfh, sym_def=sym_def) ENDIF ENDIF ! @@ -406,7 +406,7 @@ SUBROUTINE solve_linter (irr, imode0, npe, drhoscf) ! Here we symmetrize them ... ! IF (.not.lgamma_gamma) THEN - call psymdvscf (npe, irr, drhoscfh) + CALL psymdvscf(drhoscfh) IF ( noncolin.and.domag ) CALL psym_dmag( npe, irr, drhoscfh) IF (okpaw) THEN IF (minus_q) CALL PAW_dumqsymmetrize(dbecsum,npe,irr, npertx,irotmq,rtau,xq,tmq) diff --git a/PHonon/PH/sym_def.f90 b/PHonon/PH/sym_def.f90 index 9508301b0..f40d432b6 100644 --- a/PHonon/PH/sym_def.f90 +++ b/PHonon/PH/sym_def.f90 @@ -8,7 +8,7 @@ !--------------------------------------------------------------------- MODULE sym_def_module CONTAINS -subroutine sym_def (def, irr) +SUBROUTINE sym_def(def) !--------------------------------------------------------------------- !! Symmetrizes the first order changes of the Fermi energies of an !! irreducible representation. These objects are defined complex because @@ -16,60 +16,52 @@ subroutine sym_def (def, irr) ! !! Used in the q=0 metallic case only. ! - USE kinds, only : DP - USE modes, ONLY : npert, t, tmq - USE control_ph, ONLY : lgamma_gamma - - USE lr_symm_base, ONLY : minus_q, nsymq - - implicit none - - integer :: irr - !! input: the representation under consideration - complex(DP) :: def(3) - !! inp/out: the fermi energy changes. + USE kinds, ONLY : DP + USE control_ph, ONLY : lgamma_gamma + USE lr_symm_base, ONLY : minus_q, nsymq, lr_npert, upert, upert_mq + ! + IMPLICIT NONE + ! + COMPLEX(DP), INTENT(inout) :: def(3) + !! inp/out: the fermi energy changes. !! NB: def(3) should be def(npertx), but it is used only at Gamma !! where the dimension of irreps never exceeds 3. ! ! ... local variables ! - integer :: ipert, jpert, isym, irot + INTEGER :: ipert, jpert, isym ! counter on perturbations ! counter on perturbations ! counter on symmetries - ! the rotation - - complex(DP) :: w_def(3) + ! + COMPLEX(DP) :: w_def(3) ! the fermi energy changes (work array) - + ! IF (lgamma_gamma) RETURN if (nsymq == 1 .and. (.not.minus_q) ) return - if (npert(irr) > 3) CALL errore("sym_def", "npert(irr) exceeds 3", 1) + if (lr_npert > 3) CALL errore("sym_def", "lr_npert cannot exceed 3 at q=0", 1) ! ! first the symmetrization S(irotmq)*q = -q + Gi if necessary ! if (minus_q) then w_def = (0.d0, 0.d0) - do ipert = 1, npert (irr) - do jpert = 1, npert (irr) - w_def (ipert) = w_def (ipert) + tmq (jpert, ipert, irr) & - * def (jpert) + do ipert = 1, lr_npert + do jpert = 1, lr_npert + w_def(ipert) = w_def(ipert) + upert_mq(jpert, ipert) * def(jpert) enddo enddo - do ipert = 1, npert (irr) - def (ipert) = 0.5d0 * (def (ipert) + CONJG(w_def (ipert) ) ) + do ipert = 1, lr_npert + def(ipert) = 0.5d0 * (def(ipert) + CONJG(w_def(ipert)) ) enddo endif ! ! Here we symmetrize with respect to the small group of q ! w_def = (0.d0, 0.d0) - do ipert = 1, npert (irr) + do ipert = 1, lr_npert do isym = 1, nsymq - irot = isym - do jpert = 1, npert (irr) - w_def (ipert) = w_def (ipert) + t (jpert, ipert, irot, irr) & - * def (jpert) + do jpert = 1, lr_npert + w_def(ipert) = w_def(ipert) + upert(jpert, ipert, isym) * def(jpert) enddo enddo enddo @@ -77,7 +69,6 @@ subroutine sym_def (def, irr) ! normalize and exit ! def = w_def / DBLE(nsymq) - - return -end subroutine sym_def + ! +END SUBROUTINE sym_def END MODULE sym_def_module diff --git a/PHonon/PH/sym_dns_wrapper.f90 b/PHonon/PH/sym_dns_wrapper.f90 index 1fe32606e..86a7bce85 100644 --- a/PHonon/PH/sym_dns_wrapper.f90 +++ b/PHonon/PH/sym_dns_wrapper.f90 @@ -55,6 +55,7 @@ SUBROUTINE sym_dns_wrapper (ldim, dns_cart, dns_pattern) npe = npert(irr) ! allocate ALLOCATE (dns_aux(ldim,ldim,nspin,nat,npe)) + CALL ph_set_upert_phonon(irr) ! pack dns_aux(:,:,:,:,1:npe) = dns_pattern(:,:,:,:,imode0:imode0-1+npe) ! symmetrize @@ -63,6 +64,7 @@ SUBROUTINE sym_dns_wrapper (ldim, dns_cart, dns_pattern) dns_pattern(:,:,:,:,imode0:imode0-1+npe) = dns_aux(:,:,:,:,1:npe) ! deallocate DEALLOCATE (dns_aux) + CALL ph_deallocate_upert() ! advance the counter imode0 = imode0 + npe ENDDO diff --git a/PHonon/PH/symdvscf.f90 b/PHonon/PH/symdvscf.f90 index 6d241dd5c..179d30025 100644 --- a/PHonon/PH/symdvscf.f90 +++ b/PHonon/PH/symdvscf.f90 @@ -7,7 +7,7 @@ ! ! !--------------------------------------------------------------------- -subroutine symdvscf (nper, irr, dvtosym) +subroutine symdvscf (dvtosym) !--------------------------------------------------------------------- !! Symmetrize the self-consistent potential of the perturbations !! belonging to an irreducible representation. @@ -22,16 +22,12 @@ subroutine symdvscf (nper, irr, dvtosym) USE cell_base, ONLY : at USE symm_base, ONLY : s, ft, t_rev USE noncollin_module, ONLY : nspin_lsda, nspin_mag - USE modes, ONLY : t, tmq - USE lr_symm_base, ONLY : minus_q, irotmq, nsymq, gi, gimq + USE control_lr, ONLY : lgamma + USE lr_symm_base, ONLY : minus_q, irotmq, nsymq, gi, gimq, lr_npert, upert, upert_mq ! implicit none ! - integer :: nper - !! the number of perturbations - integer :: irr - !! the representation under conside - complex(DP) :: dvtosym(dfftp%nr1x,dfftp%nr2x,dfftp%nr3x,nspin_mag,nper) + complex(DP) :: dvtosym(dfftp%nr1x,dfftp%nr2x,dfftp%nr3x, nspin_mag, lr_npert) !! the potential to be symmetrized ! ! ... local variables @@ -48,54 +44,74 @@ subroutine symdvscf (nper, irr, dvtosym) ! auxiliary space ! the multiplication factor ! the phase factor - - if (nsymq == 1.and. (.not.minus_q) ) return - call start_clock ('symdvscf') - - allocate (dvsym( dfftp%nr1x , dfftp%nr2x , dfftp%nr3x , nper)) - allocate (add_dvsym(nper)) ! - ! if necessary we symmetrize with respect to S(irotmq)*q = -q + Gi + ! If there is no symmetry other than identity, return. + IF (nsymq == 1 .AND. (.NOT.minus_q) ) RETURN + ! + CALL start_clock ('symdvscf') + ! + ALLOCATE(dvsym(dfftp%nr1x, dfftp%nr2x, dfftp%nr3x, lr_npert)) + ALLOCATE(add_dvsym(lr_npert)) ! n(1) = tpi / DBLE (dfftp%nr1) n(2) = tpi / DBLE (dfftp%nr2) n(3) = tpi / DBLE (dfftp%nr3) - + ! CALL scale_sym_ops( nsymq, s, ft, dfftp%nr1, dfftp%nr2, dfftp%nr3, & s_scaled, ftau ) - - if (minus_q) then - gf(:) = gimq (1) * at (1, :) * n(:) + & - gimq (2) * at (2, :) * n(:) + & - gimq (3) * at (3, :) * n(:) - term (:, 1) = CMPLX(cos (gf (:) ), sin (gf (:) ) ,kind=DP) - do is = 1, nspin_lsda - phase (1) = (1.d0, 0.d0) - do k = 1, dfftp%nr3 - do j = 1, dfftp%nr2 - do i = 1, dfftp%nr1 - CALL rotate_grid_point(s_scaled(1,1,irotmq), ftau(1,irotmq), & - i, j, k, dfftp%nr1, dfftp%nr2, dfftp%nr3, ri, rj, rk) - do ipert = 1, nper - aux2 = (0.d0, 0.d0) - do jpert = 1, nper - aux2 = aux2 + tmq (jpert, ipert, irr) * & - dvtosym (ri, rj, rk, is, jpert) * phase (1) + ! + ! if necessary we symmetrize with respect to S(irotmq)*q = -q + Gi + ! (time reversal + spatial symmetry S(irotmq)) + ! + IF (minus_q) THEN + IF (lgamma) THEN + ! + ! Special case: q = Gamma. S(irotmq) = I, Gi = 0. + ! + DO is = 1, nspin_lsda + DO ipert = 1, lr_npert + dvtosym(:,:,:,is,ipert) = CMPLX(DBLE(dvtosym(:,:,:,is,ipert)), 0.d0, KIND=DP) + ENDDO + ENDDO + ! + ELSE ! .NOT. lgamma + gf(:) = gimq (1) * at (1, :) * n(:) + & + gimq (2) * at (2, :) * n(:) + & + gimq (3) * at (3, :) * n(:) + term (:, 1) = CMPLX(cos (gf (:) ), sin (gf (:) ) ,kind=DP) + do is = 1, nspin_lsda + phase (1) = (1.d0, 0.d0) + do k = 1, dfftp%nr3 + do j = 1, dfftp%nr2 + do i = 1, dfftp%nr1 + CALL rotate_grid_point(s_scaled(1,1,irotmq), ftau(1,irotmq), & + i, j, k, dfftp%nr1, dfftp%nr2, dfftp%nr3, ri, rj, rk) + do ipert = 1, lr_npert + aux2 = (0.d0, 0.d0) + do jpert = 1, lr_npert + aux2 = aux2 + upert_mq(jpert, ipert) * & + dvtosym (ri, rj, rk, is, jpert) * phase (1) + enddo + dvsym (i, j, k, ipert) = (dvtosym (i, j, k, is, ipert) +& + CONJG(aux2) ) * 0.5d0 enddo - dvsym (i, j, k, ipert) = (dvtosym (i, j, k, is, ipert) +& - CONJG(aux2) ) * 0.5d0 + phase (1) = phase (1) * term (1, 1) enddo - phase (1) = phase (1) * term (1, 1) + phase (1) = phase (1) * term (2, 1) enddo - phase (1) = phase (1) * term (2, 1) + phase (1) = phase (1) * term (3, 1) + enddo + do ipert = 1, lr_npert + dvtosym(:, :, :, is, ipert) = dvsym (:, :, :, ipert) enddo - phase (1) = phase (1) * term (3, 1) enddo - do ipert = 1, nper - dvtosym(:, :, :, is, ipert) = dvsym (:, :, :, ipert) - enddo - enddo - endif + ENDIF ! lgamma + ENDIF ! minus_q + ! + ! If there no spatial symmetry other than identity, return. + ! + IF (nsymq == 1) RETURN + ! ! ! Here we symmetrize with respect to the small group of q ! @@ -119,13 +135,10 @@ subroutine symdvscf (nper, irr, dvtosym) CALL rotate_grid_point(s_scaled(1,1,irot), ftau(1,irot), & i, j, k, dfftp%nr1, dfftp%nr2, dfftp%nr3, ri, rj, rk) add_dvsym(:) = (0.d0, 0.d0) - do ipert = 1, nper - do jpert = 1, nper - add_dvsym(ipert) = add_dvsym(ipert) + t (jpert, ipert, irot, irr) * & + do ipert = 1, lr_npert + do jpert = 1, lr_npert + add_dvsym(ipert) = add_dvsym(ipert) + upert(jpert, ipert, irot) * & dvtosym (ri, rj, rk, is, jpert) * phase (isym) - !dvsym (i, j, k, ipert) = dvsym (i, j, k, ipert) + & - ! t (jpert, ipert, irot, irr) * & - ! dvtosym (ri, rj, rk, is, jpert) * phase (isym) enddo enddo if (t_rev(isym)==0) then @@ -147,7 +160,7 @@ subroutine symdvscf (nper, irr, dvtosym) enddo enddo - do ipert = 1, nper + do ipert = 1, lr_npert dvtosym(:,:,:,is,ipert) = dvsym(:,:,:,ipert) / DBLE (nsymq) enddo diff --git a/PHonon/PH/syme.f90 b/PHonon/PH/syme.f90 deleted file mode 100644 index d577c81c5..000000000 --- a/PHonon/PH/syme.f90 +++ /dev/null @@ -1,78 +0,0 @@ -! -! Copyright (C) 2001-2008 Quantum ESPRESSO group -! This file is distributed under the terms of the -! GNU General Public License. See the file `License' -! in the root directory of the present distribution, -! or http://www.gnu.org/copyleft/gpl.txt . -! -! -!--------------------------------------------------------------------- -subroutine syme (dvsym) - !--------------------------------------------------------------------- - !! This routine symmetrize the change of the potential due to an - !! electric field perturbation. It is assumed that the perturbations - !! are on the basis of the crystal. - ! - USE fft_base, only : dfftp - USE symm_base, only : nsym, s, ft - USE noncollin_module, only : nspin_lsda, nspin_mag - USE kinds, only : DP - - implicit none - - complex(DP) :: dvsym(dfftp%nr1x,dfftp%nr2x,dfftp%nr3x,nspin_mag,3) - !! the potential to symmetrize - - ! ... local variables - - complex(DP), allocatable :: aux(:,:,:,:) - ! auxiliary quantity - - integer :: ftau(3,nsym), s_scaled(3,3,nsym) - integer :: is, ri, rj, rk, i, j, k, irot, ipol, jpol - ! counter on spin polarization - ! the rotated points - ! the point - ! counter on symmetries - ! counter on polarizations - - do is = 1, nspin_lsda - do ipol = 1, 3 - dvsym(:,:,:,is,ipol) = CMPLX(DBLE(dvsym(:,:,:,is,ipol)),0.d0,kind=DP) - end do - end do - if (nsym == 1) return - CALL scale_sym_ops( nsym, s, ft, dfftp%nr1, dfftp%nr2, dfftp%nr3, & - s_scaled, ftau ) - allocate (aux(dfftp%nr1x , dfftp%nr2x , dfftp%nr3x , 3)) - do is = 1, nspin_lsda - do ipol = 1, 3 - aux(:,:,:,ipol) = dvsym(:,:,:,is,ipol) - dvsym(:,:,:,is,ipol) = (0.d0, 0.d0) - enddo - ! - ! symmmetrize - ! - do k = 1, dfftp%nr3 - do j = 1, dfftp%nr2 - do i = 1, dfftp%nr1 - do irot = 1, nsym - CALL rotate_grid_point(s_scaled(1,1,irot), ftau(1,irot), & - i, j, k, dfftp%nr1, dfftp%nr2, dfftp%nr3, ri, rj, rk) - do ipol = 1, 3 - do jpol = 1, 3 - dvsym(i,j,k,is,ipol) = dvsym(i,j,k,is,ipol) + & - s(ipol,jpol,irot) * aux(ri,rj,rk,jpol) - enddo - enddo - enddo - enddo - enddo - enddo - do ipol = 1, 3 - dvsym(:,:,:,is,ipol) = dvsym(:,:,:,is,ipol) / DBLE(nsym) - enddo - enddo - deallocate (aux) - return -end subroutine syme diff --git a/PHonon/PH/zstar_eu_us.f90 b/PHonon/PH/zstar_eu_us.f90 index 9bc812f7c..36378d3c4 100644 --- a/PHonon/PH/zstar_eu_us.f90 +++ b/PHonon/PH/zstar_eu_us.f90 @@ -149,7 +149,7 @@ subroutine zstar_eu_us ! call dv_of_drho (dvscf (:, :, ipol), .false.) enddo - call psyme (dvscf) + call psymdvscf(dvscf) #ifdef TIMINIG_ZSTAR_US call stop_clock('zstar_us_3') From 29b99d48a9b81e2127c5fb17973d17d4be38e85e Mon Sep 17 00:00:00 2001 From: Ivan Carnimeo Date: Mon, 19 Aug 2024 12:32:02 +0200 Subject: [PATCH 4/7] Remove wrappers in CG --- KS_Solvers/CG/ccgdiagg_gpu.f90 | 50 ++++++---------------------------- KS_Solvers/CG/rcgdiagg_gpu.f90 | 42 ++++++++-------------------- 2 files changed, 20 insertions(+), 72 deletions(-) diff --git a/KS_Solvers/CG/ccgdiagg_gpu.f90 b/KS_Solvers/CG/ccgdiagg_gpu.f90 index b73ed651e..0e007a155 100644 --- a/KS_Solvers/CG/ccgdiagg_gpu.f90 +++ b/KS_Solvers/CG/ccgdiagg_gpu.f90 @@ -8,38 +8,6 @@ ! #define ZERO ( 0.D0, 0.D0 ) #define ONE ( 1.D0, 0.D0 ) - -FUNCTION KSDdot( n, A, incx, B, incy) result( res ) - ! - USE util_param, ONLY : DP -#if defined(__CUDA) - USE cudafor - USE cublas -#endif - ! - IMPLICIT NONE - ! - INTEGER, INTENT(IN) :: incx,incy,n - ! -#if defined(__CUDA) - REAL(DP), DEVICE, INTENT(IN) :: A(n), B(n) -#else - REAL(DP), INTENT(IN) :: A(n), B(n) - REAL(DP), EXTERNAL :: ddot -#endif - ! - REAL(DP) :: res - ! -#if defined(__CUDA) - res = cublasDdot( n, A, incx, B, incy ) -#else - res = ddot( n, A, incx, B, incy ) -#endif - ! - RETURN - ! -END FUNCTION KSDdot - ! define __VERBOSE to print a message after each eigenvalue is computed ! !---------------------------------------------------------------------------- @@ -102,7 +70,7 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, & ! ! ... external functions ! - REAL (DP), EXTERNAL :: ksDdot + REAL (DP), EXTERNAL :: MYDDOT EXTERNAL hs_1psi_ptr, s_1psi_ptr ! hs_1psi( npwx, npw, psi, hpsi, spsi ) ! s_1psi( npwx, npw, psi, spsi ) @@ -249,7 +217,7 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, & ! ... NB: ddot(2*npw,a,1,b,1) = REAL( zdotc(npw,a,1,b,1) ) ! !$acc host_data use_device(psi, hpsi) - e(m) = ksDdot( kdim2, psi(1,m), 1, hpsi, 1 ) + e(m) = MYDDOT( kdim2, psi(1,m), 1, hpsi, 1 ) !$acc end host_data ! CALL mp_sum( e(m), intra_bgrp_comm ) @@ -271,8 +239,8 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, & ! ... ppsi is now S P(P^2)|y> = S P^2|psi>) ! !$acc host_data use_device(spsi, g, ppsi) - es(1) = ksDdot( kdim2, spsi(1), 1, g(1), 1 ) - es(2) = ksDdot( kdim2, spsi(1), 1, ppsi(1), 1 ) + es(1) = MYDDOT( kdim2, spsi(1), 1, g(1), 1 ) + es(2) = MYDDOT( kdim2, spsi(1), 1, ppsi(1), 1 ) !$acc end host_data ! CALL mp_sum( es , intra_bgrp_comm ) @@ -325,7 +293,7 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, & ! ... gg1 is (used in Polak-Ribiere formula) ! !$acc host_data use_device(g) - gg1 = ksDdot( kdim2, g(1), 1, g0_d(1), 1 ) + gg1 = MYDDOT( kdim2, g(1), 1, g0_d(1), 1 ) !$acc end host_data ! CALL mp_sum( gg1, intra_bgrp_comm ) @@ -341,7 +309,7 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, & !$acc end kernels ! !$acc host_data use_device(g) - gg = ksDdot( kdim2, g(1), 1, g0_d(1), 1 ) + gg = MYDDOT( kdim2, g(1), 1, g0_d(1), 1 ) !$acc end host_data ! CALL mp_sum( gg, intra_bgrp_comm ) @@ -396,7 +364,7 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, & CALL hs_1psi_ptr( npwx, npw, cg(1), ppsi(1), scg(1) ) ! !$acc host_data use_device(scg, cg) - cg0 = ksDdot( kdim2, cg(1), 1, scg(1), 1 ) + cg0 = MYDDOT( kdim2, cg(1), 1, scg(1), 1 ) !$acc end host_data ! CALL mp_sum( cg0 , intra_bgrp_comm ) @@ -412,13 +380,13 @@ SUBROUTINE ccgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, & ! ... = 1 ! !$acc host_data use_device(psi, ppsi) - a0 = 2.D0 * ksDdot( kdim2, psi(1,m), 1, ppsi(1), 1 ) / cg0 + a0 = 2.D0 * MYDDOT( kdim2, psi(1,m), 1, ppsi(1), 1 ) / cg0 !$acc end host_data ! CALL mp_sum( a0 , intra_bgrp_comm ) ! !$acc host_data use_device(cg, ppsi) - b0 = ksDdot( kdim2, cg(1), 1, ppsi(1), 1 ) / cg0**2 + b0 = MYDDOT( kdim2, cg(1), 1, ppsi(1), 1 ) / cg0**2 !$acc end host_data ! CALL mp_sum( b0 , intra_bgrp_comm ) diff --git a/KS_Solvers/CG/rcgdiagg_gpu.f90 b/KS_Solvers/CG/rcgdiagg_gpu.f90 index c3f288237..8aeab6e4b 100644 --- a/KS_Solvers/CG/rcgdiagg_gpu.f90 +++ b/KS_Solvers/CG/rcgdiagg_gpu.f90 @@ -5,26 +5,6 @@ ! in the root directory of the present distribution, ! or http://www.gnu.org/copyleft/gpl.txt . ! - - -SUBROUTINE cgcudaDGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) -#if defined(__CUDA) - use cudafor - use cublas -#endif - IMPLICIT NONE - DOUBLE PRECISION :: ALPHA,BETA - INTEGER :: INCX,INCY,LDA,M,N - CHARACTER :: TRANS - DOUBLE PRECISION :: A(LDA,*),X(*),Y(*) -#if defined(__CUDA) - attributes(device) :: A, X, Y -#endif - ! - call DGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) - ! -END SUBROUTINE cgcudaDGEMV - ! define __VERBOSE to print a message after each eigenvalue is computed !---------------------------------------------------------------------------- SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, & @@ -87,7 +67,7 @@ SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, & ! ! ... external functions ! - REAL(DP), EXTERNAL :: ksDdot + REAL(DP), EXTERNAL :: MYDDOT EXTERNAL hs_1psi_ptr, s_1psi_ptr ! hs_1psi( npwx, npw, psi, hpsi, spsi ) ! s_1psi( npwx, npw, psi, spsi ) @@ -148,7 +128,7 @@ SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, & lagrange = 0.d0 if(m_start.le.m_end) then !$acc host_data use_device(psi, spsi) - CALL cgcudaDGEMV( 'T', npw2, m_end-m_start+1, 2.D0, psi(1,m_start), npwx2, spsi, 1, 0.D0, lagrange_d(m_start), 1 ) + CALL MYDGEMV( 'T', npw2, m_end-m_start+1, 2.D0, psi(1,m_start), npwx2, spsi, 1, 0.D0, lagrange_d(m_start), 1 ) !$acc end host_data endif if(m_start.le.m_end) lagrange( m_start:m_end ) = lagrange_d( m_start:m_end ) @@ -202,7 +182,7 @@ SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, & ! ... NB: ddot(2*npw,a,1,b,1) = DBLE( zdotc(npw,a,1,b,1) ) ! !$acc host_data use_device(psi, hpsi) - e(m) = 2.D0 * ksDdot( npw2, psi(1,m), 1, hpsi, 1 ) + e(m) = 2.D0 * MYDDOT( npw2, psi(1,m), 1, hpsi, 1 ) !$acc end host_data !print *, 'e(m)', e(m) IF ( gstart == 2 ) THEN @@ -232,8 +212,8 @@ SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, & ! ... ppsi is now S P(P^2)|y> = S P^2|psi>) ! !$acc host_data use_device(spsi, g, ppsi) - es(1) = 2.D0 * ksDdot( npw2, spsi(1), 1, g(1), 1 ) - es(2) = 2.D0 * ksDdot( npw2, spsi(1), 1, ppsi(1), 1 ) + es(1) = 2.D0 * MYDDOT( npw2, spsi(1), 1, g(1), 1 ) + es(2) = 2.D0 * MYDDOT( npw2, spsi(1), 1, ppsi(1), 1 ) !$acc end host_data ! IF ( gstart == 2 ) THEN @@ -270,7 +250,7 @@ SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, & call divide(inter_bgrp_comm,m-1,m_start,m_end); !write(*,*) m-1,m_start,m_end if(m_start.le.m_end) then !$acc host_data use_device(psi, scg) - CALL cgcudaDGEMV( 'T', npw2, m_end-m_start+1, 2.D0, psi(1,m_start), npw2, scg, 1, 0.D0, lagrange_d(m_start), 1 ) + CALL MYDGEMV( 'T', npw2, m_end-m_start+1, 2.D0, psi(1,m_start), npw2, scg, 1, 0.D0, lagrange_d(m_start), 1 ) !$acc end host_data endif if(m_start.le.m_end) lagrange( m_start:m_end ) = lagrange_d( m_start:m_end ) @@ -301,7 +281,7 @@ SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, & ! ... gg1 is (used in Polak-Ribiere formula) ! !$acc host_data use_device(g) - gg1 = 2.D0 * ksDdot( npw2, g(1), 1, g0_d(1), 1 ) + gg1 = 2.D0 * MYDDOT( npw2, g(1), 1, g0_d(1), 1 ) !$acc end host_data IF ( gstart == 2 ) THEN !$acc update self(g) @@ -328,7 +308,7 @@ SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, & !$acc end kernels ! !$acc host_data use_device(g) - gg = 2.D0 * ksDdot( npw2, g(1), 1, g0_d(1), 1 ) + gg = 2.D0 * MYDDOT( npw2, g(1), 1, g0_d(1), 1 ) !$acc end host_data IF ( gstart == 2 ) THEN !$acc update self(g) @@ -390,7 +370,7 @@ SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, & CALL hs_1psi_ptr( npwx, npw, cg(1), ppsi(1), scg(1) ) ! !$acc host_data use_device(cg, scg) - cg0 = 2.D0 * ksDdot( npw2, cg(1), 1, scg(1), 1 ) + cg0 = 2.D0 * MYDDOT( npw2, cg(1), 1, scg(1), 1 ) !$acc end host_data IF ( gstart == 2 ) THEN !$acc update self(cg, scg) @@ -411,7 +391,7 @@ SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, & ! ... = 1 ! !$acc host_data use_device(psi, ppsi) - a0 = 4.D0 * ksDdot( npw2, psi(1,m), 1, ppsi(1), 1 ) + a0 = 4.D0 * MYDDOT( npw2, psi(1,m), 1, ppsi(1), 1 ) !$acc end host_data IF ( gstart == 2 ) THEN !$acc update self(psi, ppsi) @@ -425,7 +405,7 @@ SUBROUTINE rcgdiagg_gpu( hs_1psi_ptr, s_1psi_ptr, precondition, & CALL mp_sum( a0 , intra_bgrp_comm ) ! !$acc host_data use_device(cg, ppsi) - b0 = 2.D0 * ksDdot( npw2, cg(1), 1, ppsi(1), 1 ) + b0 = 2.D0 * MYDDOT( npw2, cg(1), 1, ppsi(1), 1 ) !$acc end host_data IF ( gstart == 2 ) THEN !$acc update self(cg, ppsi) From 4bff5c4cf0662285a4db3049ebf21809ae2ec0e1 Mon Sep 17 00:00:00 2001 From: Ivan Carnimeo Date: Mon, 19 Aug 2024 13:12:56 +0200 Subject: [PATCH 5/7] Remove redundant wrappers to BLAS/LAPACK gpu routines (all calls are redirected to UtilXlib/device_helper.f90) --- KS_Solvers/DENSE/gram_schmidt_gamma_gpu.f90 | 62 ++--- KS_Solvers/DENSE/gram_schmidt_k_gpu.f90 | 54 ++-- KS_Solvers/PPCG/generic_cublas.f90 | 264 -------------------- KS_Solvers/PPCG/ppcg_gamma_gpu.f90 | 110 ++++---- KS_Solvers/PPCG/ppcg_k_gpu.f90 | 76 +++--- KS_Solvers/RMM/crmmdiagg_gpu.f90 | 94 +++---- KS_Solvers/RMM/rrmmdiagg_gpu.f90 | 102 ++++---- UtilXlib/device_helper.f90 | 165 ++++++++++++ 8 files changed, 414 insertions(+), 513 deletions(-) diff --git a/KS_Solvers/DENSE/gram_schmidt_gamma_gpu.f90 b/KS_Solvers/DENSE/gram_schmidt_gamma_gpu.f90 index d1bc0d953..4c73045d4 100644 --- a/KS_Solvers/DENSE/gram_schmidt_gamma_gpu.f90 +++ b/KS_Solvers/DENSE/gram_schmidt_gamma_gpu.f90 @@ -127,7 +127,7 @@ SUBROUTINE gram_schmidt_gamma_gpu( npwx, npw, nbnd, psi_d, hpsi_d, spsi_d, e, & ! ! ... Set initial : |phi_j> = |psi_j> ! - CALL DCOPY_gpu( npwx2 * nbnd, psi_d(1,1), 1, phi_d(1,1), 1 ) + CALL MYDCOPY( npwx2 * nbnd, psi_d(1,1), 1, phi_d(1,1), 1 ) ! ! NOTE: set Im[ phi(G=0) ] - needed for numerical stability ! @@ -141,7 +141,7 @@ SUBROUTINE gram_schmidt_gamma_gpu( npwx, npw, nbnd, psi_d, hpsi_d, spsi_d, e, & ! IF ( eigen_ ) THEN ! - CALL DCOPY_gpu( npwx2 * nbnd, hpsi_d(1,1), 1, hphi_d(1,1), 1 ) + CALL MYDCOPY( npwx2 * nbnd, hpsi_d(1,1), 1, hphi_d(1,1), 1 ) ! ! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability IF ( gstart == 2 ) THEN @@ -155,7 +155,7 @@ SUBROUTINE gram_schmidt_gamma_gpu( npwx, npw, nbnd, psi_d, hpsi_d, spsi_d, e, & ! IF ( uspp ) THEN ! - CALL DCOPY_gpu( npwx2 * nbnd, spsi_d(1,1), 1, sphi_d(1,1), 1 ) + CALL MYDCOPY( npwx2 * nbnd, spsi_d(1,1), 1, sphi_d(1,1), 1 ) ! ! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability IF ( gstart == 2 ) THEN @@ -217,14 +217,14 @@ SUBROUTINE gram_schmidt_gamma_gpu( npwx, npw, nbnd, psi_d, hpsi_d, spsi_d, e, & ! ... Copy psi <- phi ! !CALL DCOPY( npwx2 * nbnd, phi(1,1), 1, psi(1,1), 1 ) - CALL DCOPY_gpu( npwx2 * nbnd, phi_d(1,1), 1, psi_d(1,1), 1 ) + CALL MYDCOPY( npwx2 * nbnd, phi_d(1,1), 1, psi_d(1,1), 1 ) ! IF ( eigen_ ) & - CALL DCOPY_gpu( npwx2 * nbnd, hphi_d(1,1), 1, hpsi_d(1,1), 1 ) + CALL MYDCOPY( npwx2 * nbnd, hphi_d(1,1), 1, hpsi_d(1,1), 1 ) !CALL DCOPY( npwx2 * nbnd, hphi(1,1), 1, hpsi(1,1), 1 ) ! IF ( uspp ) & - CALL DCOPY_gpu( npwx2 * nbnd, sphi_d(1,1), 1, spsi_d(1,1), 1 ) + CALL MYDCOPY( npwx2 * nbnd, sphi_d(1,1), 1, spsi_d(1,1), 1 ) !CALL DCOPY( npwx2 * nbnd, sphi(1,1), 1, spsi(1,1), 1 ) ! ! ... Calculate energy eigenvalues @@ -259,7 +259,7 @@ CONTAINS INTEGER :: ibnd REAL(DP) :: norm REAL(DP) :: psi_ibnd - REAL(DP), EXTERNAL :: gpu_DDOT + REAL(DP), EXTERNAL :: MYDDOT ! DO ibnd = ibnd_start, ibnd_end ! @@ -269,24 +269,24 @@ CONTAINS ! IF ( uspp ) THEN ! - CALL DGEMV_gpu( 'T', npw2, ibnd - ibnd_start, 2._DP, phi_d(1,ibnd_start), npwx2, & + CALL MYDGEMV( 'T', npw2, ibnd - ibnd_start, 2._DP, phi_d(1,ibnd_start), npwx2, & spsi_d(1,ibnd), 1, 0._DP, sr_d(1), 1 ) ! IF ( gstart == 2 ) THEN psi_ibnd = -spsi_d(1,ibnd) - CALL DAXPY_gpu( ibnd - ibnd_start, psi_ibnd , phi_d(1,ibnd_start), npwx2, & + CALL MYDAXPY( ibnd - ibnd_start, psi_ibnd , phi_d(1,ibnd_start), npwx2, & sr_d(1), 1 ) END IF ! ELSE ! - CALL DGEMV_gpu( 'T', npw2, ibnd - ibnd_start, 2._DP, phi_d(1,ibnd_start), npwx2, & + CALL MYDGEMV( 'T', npw2, ibnd - ibnd_start, 2._DP, phi_d(1,ibnd_start), npwx2, & psi_d(1,ibnd), 1, 0._DP, sr_d(1), 1 ) ! IF ( gstart == 2 ) THEN psi_ibnd = -psi_d(1,ibnd) - CALL DAXPY_gpu( ibnd - ibnd_start, psi_ibnd, phi_d(1,ibnd_start), npwx2, & + CALL MYDAXPY( ibnd - ibnd_start, psi_ibnd, phi_d(1,ibnd_start), npwx2, & sr_d(1), 1 ) END IF ! @@ -296,7 +296,7 @@ CONTAINS ! ! ... phi_i = phi_i - phi_j * ! - CALL DGEMV_gpu( 'N', npw2, ibnd - ibnd_start, -1._DP, phi_d(1,ibnd_start), npwx2, & + CALL MYDGEMV( 'N', npw2, ibnd - ibnd_start, -1._DP, phi_d(1,ibnd_start), npwx2, & sr_d(1), 1, 1._DP, phi_d(1,ibnd), 1 ) ! ! NOTE: set Im[ phi(G=0) ] - needed for numerical stability @@ -309,7 +309,7 @@ CONTAINS ! IF ( eigen_ ) THEN ! - CALL DGEMV_gpu( 'N', npw2, ibnd - ibnd_start, -1._DP, hphi_d(1,ibnd_start), npwx2, & + CALL MYDGEMV( 'N', npw2, ibnd - ibnd_start, -1._DP, hphi_d(1,ibnd_start), npwx2, & sr_d(1), 1, 1._DP, hphi_d(1,ibnd), 1 ) ! ! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability @@ -324,7 +324,7 @@ CONTAINS ! IF ( uspp ) THEN ! - CALL DGEMV_gpu( 'N', npw2, ibnd - ibnd_start, -1._DP, sphi_d(1,ibnd_start), npwx2, & + CALL MYDGEMV( 'N', npw2, ibnd - ibnd_start, -1._DP, sphi_d(1,ibnd_start), npwx2, & sr_d(1), 1, 1._DP, sphi_d(1,ibnd), 1 ) ! ! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability @@ -343,7 +343,7 @@ CONTAINS ! IF ( uspp ) THEN ! - norm = 2._DP * gpu_DDOT( npw2, phi_d(1,ibnd), 1, sphi_d(1,ibnd), 1 ) + norm = 2._DP * MYDDOT( npw2, phi_d(1,ibnd), 1, sphi_d(1,ibnd), 1 ) ! IF ( gstart == 2 ) THEN !$cuf kernel do(1) @@ -354,7 +354,7 @@ CONTAINS ! ELSE ! - norm = 2._DP * gpu_DDOT( npw2, phi_d(1,ibnd), 1, phi_d(1,ibnd), 1 ) + norm = 2._DP * MYDDOT( npw2, phi_d(1,ibnd), 1, phi_d(1,ibnd), 1 ) ! IF ( gstart == 2 ) THEN !$cuf kernel do(1) @@ -372,7 +372,7 @@ CONTAINS IF ( norm < eps16 ) & CALL errore( ' gram_schmidt_gamma ', ' vectors are linear dependent ', 1 ) ! - CALL DSCAL_gpu( npw2, 1._DP / norm, phi_d(1,ibnd), 1 ) + CALL MYDSCAL( npw2, 1._DP / norm, phi_d(1,ibnd), 1 ) ! ! NOTE: set Im[ phi(G=0) ] - needed for numerical stability IF ( gstart == 2 ) THEN @@ -384,7 +384,7 @@ CONTAINS ! IF ( eigen_ ) THEN ! - CALL DSCAL_gpu( npw2, 1._DP / norm, hphi_d(1,ibnd), 1 ) + CALL MYDSCAL( npw2, 1._DP / norm, hphi_d(1,ibnd), 1 ) ! ! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability IF ( gstart == 2 ) THEN @@ -398,7 +398,7 @@ CONTAINS ! IF ( uspp ) THEN ! - CALL DSCAL_gpu( npw2, 1._DP / norm, sphi_d(1,ibnd), 1 ) + CALL MYDSCAL( npw2, 1._DP / norm, sphi_d(1,ibnd), 1 ) ! ! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability IF ( gstart == 2 ) THEN @@ -435,20 +435,20 @@ CONTAINS ! IF ( uspp ) THEN ! - CALL gpu_DGEMM( 'T', 'N', ibnd_size, jbnd_size, npw2, 2._DP, phi_d(1,ibnd_start), npwx2, & + CALL MYDGEMM( 'T', 'N', ibnd_size, jbnd_size, npw2, 2._DP, phi_d(1,ibnd_start), npwx2, & spsi_d(1,jbnd_start), npwx2, 0._DP, sr2_d(1,1), nbsize ) ! IF ( gstart == 2 ) & - CALL gpu_DGER( ibnd_size, jbnd_size, -1._DP, psi_d(1,ibnd_start), npwx2, & + CALL MYDGER( ibnd_size, jbnd_size, -1._DP, psi_d(1,ibnd_start), npwx2, & spsi_d(1,jbnd_start), npwx2, sr2_d(1,1), nbsize ) ! ELSE ! - CALL gpu_DGEMM( 'T', 'N', ibnd_size, jbnd_size, npw2, 2._DP, phi_d(1,ibnd_start), npwx2, & + CALL MYDGEMM( 'T', 'N', ibnd_size, jbnd_size, npw2, 2._DP, phi_d(1,ibnd_start), npwx2, & psi_d(1,jbnd_start), npwx2, 0._DP, sr2_d(1,1), nbsize ) ! IF ( gstart == 2 ) & - CALL gpu_DGER( ibnd_size, jbnd_size, -1._DP, psi_d(1,ibnd_start), npwx2, & + CALL MYDGER( ibnd_size, jbnd_size, -1._DP, psi_d(1,ibnd_start), npwx2, & psi_d(1,jbnd_start), npwx2, sr2_d(1,1), nbsize ) ! END IF @@ -457,7 +457,7 @@ CONTAINS ! ! ... phi_j = phi_j - phi_i * ! - CALL gpu_DGEMM( 'N', 'N', npw2, jbnd_size, ibnd_size, -1._DP, phi_d(1,ibnd_start), npwx2, & + CALL MYDGEMM( 'N', 'N', npw2, jbnd_size, ibnd_size, -1._DP, phi_d(1,ibnd_start), npwx2, & sr2_d(1,1), nbsize, 1._DP, phi_d(1,jbnd_start), npwx2 ) ! ! NOTE: set Im[ phi(G=0) ] - needed for numerical stability @@ -471,7 +471,7 @@ CONTAINS ! IF ( eigen_ ) THEN ! - CALL gpu_DGEMM( 'N', 'N', npw2, jbnd_size, ibnd_size, -1._DP, hphi_d(1,ibnd_start), npwx2, & + CALL MYDGEMM( 'N', 'N', npw2, jbnd_size, ibnd_size, -1._DP, hphi_d(1,ibnd_start), npwx2, & sr2_d(1,1), nbsize, 1._DP, hphi_d(1,jbnd_start), npwx2 ) ! ! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability @@ -487,7 +487,7 @@ CONTAINS ! IF ( uspp ) THEN ! - CALL gpu_DGEMM( 'N', 'N', npw2, jbnd_size, ibnd_size, -1._DP, sphi_d(1,ibnd_start), npwx2, & + CALL MYDGEMM( 'N', 'N', npw2, jbnd_size, ibnd_size, -1._DP, sphi_d(1,ibnd_start), npwx2, & sr2_d(1,1), nbsize, 1._DP, sphi_d(1,jbnd_start), npwx2 ) ! ! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability @@ -513,7 +513,7 @@ CONTAINS INTEGER :: ibnd, ibnd_start, ibnd_end REAL(DP) :: e_ibnd ! - REAL(DP), EXTERNAL :: gpu_DDOT + REAL(DP), EXTERNAL :: MYDDOT ! ! ... ! @@ -523,7 +523,7 @@ CONTAINS ! DO ibnd = ibnd_start, ibnd_end ! - e(ibnd) = 2._DP * gpu_DDOT( npw2, psi_d(1,ibnd), 1, hpsi_d(1,ibnd), 1 ) + e(ibnd) = 2._DP * MYDDOT( npw2, psi_d(1,ibnd), 1, hpsi_d(1,ibnd), 1 ) ! IF ( gstart == 2 ) THEN !$cuf kernel do(1) @@ -563,13 +563,13 @@ CONTAINS e(ibnd) = e(ibnd-1) e(ibnd-1) = e0 ! - CALL DSWAP_gpu( npw2, psi_d(1,ibnd), 1, psi_d(1,ibnd-1), 1 ) + CALL MYDSWAP( npw2, psi_d(1,ibnd), 1, psi_d(1,ibnd-1), 1 ) ! IF ( eigen_ ) & - CALL DSWAP_gpu( npw2, hpsi_d(1,ibnd), 1, hpsi_d(1,ibnd-1), 1 ) + CALL MYDSWAP( npw2, hpsi_d(1,ibnd), 1, hpsi_d(1,ibnd-1), 1 ) ! IF ( uspp ) & - CALL DSWAP_gpu( npw2, spsi_d(1,ibnd), 1, spsi_d(1,ibnd-1), 1 ) + CALL MYDSWAP( npw2, spsi_d(1,ibnd), 1, spsi_d(1,ibnd-1), 1 ) ! END IF ! diff --git a/KS_Solvers/DENSE/gram_schmidt_k_gpu.f90 b/KS_Solvers/DENSE/gram_schmidt_k_gpu.f90 index 10f0b2b17..df985bcf3 100644 --- a/KS_Solvers/DENSE/gram_schmidt_k_gpu.f90 +++ b/KS_Solvers/DENSE/gram_schmidt_k_gpu.f90 @@ -142,13 +142,13 @@ SUBROUTINE gram_schmidt_k_gpu( npwx, npw, nbnd, npol, psi_d, hpsi_d, spsi_d, e, ! ! ... Set initial : |phi_j> = |psi_j> ! - CALL ZCOPY_gpu( kdmx * nbnd, psi_d(1,1), 1, phi_d(1,1), 1 ) + CALL MYZCOPY( kdmx * nbnd, psi_d(1,1), 1, phi_d(1,1), 1 ) ! IF ( eigen_ ) & - CALL ZCOPY_gpu( kdmx * nbnd, hpsi_d(1,1), 1, hphi_d(1,1), 1 ) + CALL MYZCOPY( kdmx * nbnd, hpsi_d(1,1), 1, hphi_d(1,1), 1 ) ! IF ( uspp ) & - CALL ZCOPY_gpu( kdmx * nbnd, spsi_d(1,1), 1, sphi_d(1,1), 1 ) + CALL MYZCOPY( kdmx * nbnd, spsi_d(1,1), 1, sphi_d(1,1), 1 ) ! ! ! ... Allocate buffers @@ -204,13 +204,13 @@ SUBROUTINE gram_schmidt_k_gpu( npwx, npw, nbnd, npol, psi_d, hpsi_d, spsi_d, e, ! ! ... Copy psi <- phi ! - CALL ZCOPY_gpu( kdmx * nbnd, phi_d(1,1), 1, psi_d(1,1), 1 ) + CALL MYZCOPY( kdmx * nbnd, phi_d(1,1), 1, psi_d(1,1), 1 ) ! IF ( eigen_ ) & - CALL ZCOPY_gpu( kdmx * nbnd, hphi_d(1,1), 1, hpsi_d(1,1), 1 ) + CALL MYZCOPY( kdmx * nbnd, hphi_d(1,1), 1, hpsi_d(1,1), 1 ) ! IF ( uspp ) & - CALL ZCOPY_gpu( kdmx * nbnd, sphi_d(1,1), 1, spsi_d(1,1), 1 ) + CALL MYZCOPY( kdmx * nbnd, sphi_d(1,1), 1, spsi_d(1,1), 1 ) ! ! ... Calculate energy eigenvalues ! @@ -240,7 +240,7 @@ CONTAINS ! INTEGER :: ibnd REAL(DP) :: norm - COMPLEX(DP), EXTERNAL :: ZDOTC_gpu + COMPLEX(DP), EXTERNAL :: MYZDOTC ! ! DO ibnd = ibnd_start, ibnd_end @@ -251,12 +251,12 @@ CONTAINS ! IF ( uspp ) THEN ! - CALL ZGEMV_gpu( 'C', kdim, ibnd - ibnd_start, ONE, phi_d(1,ibnd_start), kdmx, & + CALL MYZGEMV( 'C', kdim, ibnd - ibnd_start, ONE, phi_d(1,ibnd_start), kdmx, & spsi_d(1,ibnd), 1, ZERO, sc_d(1), 1 ) ! ELSE ! - CALL ZGEMV_gpu( 'C', kdim, ibnd - ibnd_start, ONE, phi_d(1,ibnd_start), kdmx, & + CALL MYZGEMV( 'C', kdim, ibnd - ibnd_start, ONE, phi_d(1,ibnd_start), kdmx, & psi_d(1,ibnd), 1, ZERO, sc_d(1), 1 ) ! END IF @@ -266,16 +266,16 @@ CONTAINS ! ! ... phi_i = phi_i - phi_j * ! - CALL ZGEMV_gpu( 'N', kdim, ibnd - ibnd_start, MONE, phi_d(1,ibnd_start), kdmx, & + CALL MYZGEMV( 'N', kdim, ibnd - ibnd_start, MONE, phi_d(1,ibnd_start), kdmx, & sc_d(1), 1, ONE, phi_d(1,ibnd), 1 ) ! ! IF ( eigen_ ) & - CALL ZGEMV_gpu( 'N', kdim, ibnd - ibnd_start, MONE, hphi_d(1,ibnd_start), kdmx, & + CALL MYZGEMV( 'N', kdim, ibnd - ibnd_start, MONE, hphi_d(1,ibnd_start), kdmx, & sc_d(1), 1, ONE, hphi_d(1,ibnd), 1 ) ! IF ( uspp ) & - CALL ZGEMV_gpu( 'N', kdim, ibnd - ibnd_start, MONE, sphi_d(1,ibnd_start), kdmx, & + CALL MYZGEMV( 'N', kdim, ibnd - ibnd_start, MONE, sphi_d(1,ibnd_start), kdmx, & sc_d(1), 1, ONE, sphi_d(1,ibnd), 1 ) ! END IF @@ -284,11 +284,11 @@ CONTAINS ! IF ( uspp ) THEN ! - norm = DBLE( ZDOTC_gpu( kdim, phi_d(1,ibnd), 1, sphi_d(1,ibnd), 1 ) ) + norm = DBLE( MYZDOTC( kdim, phi_d(1,ibnd), 1, sphi_d(1,ibnd), 1 ) ) ! ELSE ! - norm = DBLE( ZDOTC_gpu( kdim, phi_d(1,ibnd), 1, phi_d(1,ibnd), 1 ) ) + norm = DBLE( MYZDOTC( kdim, phi_d(1,ibnd), 1, phi_d(1,ibnd), 1 ) ) ! END IF ! @@ -299,13 +299,13 @@ CONTAINS IF ( norm < eps16 ) & CALL errore( ' gram_schmidt_k ', ' vectors are linear dependent ', 1 ) ! - CALL ZDSCAL_gpu( kdim, 1._DP / norm, phi_d(1,ibnd), 1 ) + CALL MYZDSCAL( kdim, 1._DP / norm, phi_d(1,ibnd), 1 ) ! IF ( eigen_ ) & - CALL ZDSCAL_gpu( kdim, 1._DP / norm, hphi_d(1,ibnd), 1 ) + CALL MYZDSCAL( kdim, 1._DP / norm, hphi_d(1,ibnd), 1 ) ! IF ( uspp ) & - CALL ZDSCAL_gpu( kdim, 1._DP / norm, sphi_d(1,ibnd), 1 ) + CALL MYZDSCAL( kdim, 1._DP / norm, sphi_d(1,ibnd), 1 ) ! END DO ! @@ -332,12 +332,12 @@ CONTAINS ! IF ( uspp ) THEN ! - CALL gpu_ZGEMM( 'C', 'N', ibnd_size, jbnd_size, kdim, ONE, phi_d(1,ibnd_start), kdmx, & + CALL MYZGEMM( 'C', 'N', ibnd_size, jbnd_size, kdim, ONE, phi_d(1,ibnd_start), kdmx, & spsi_d(1,jbnd_start), kdmx, ZERO, sc2_d(1,1), nbsize ) ! ELSE ! - CALL gpu_ZGEMM( 'C', 'N', ibnd_size, jbnd_size, kdim, ONE, phi_d(1,ibnd_start), kdmx, & + CALL MYZGEMM( 'C', 'N', ibnd_size, jbnd_size, kdim, ONE, phi_d(1,ibnd_start), kdmx, & psi_d(1,jbnd_start), kdmx, ZERO, sc2_d(1,1), nbsize ) ! END IF @@ -346,15 +346,15 @@ CONTAINS ! ! ... phi_j = phi_j - phi_i * ! - CALL gpu_ZGEMM( 'N', 'N', kdim, jbnd_size, ibnd_size, MONE, phi_d(1,ibnd_start), kdmx, & + CALL MYZGEMM( 'N', 'N', kdim, jbnd_size, ibnd_size, MONE, phi_d(1,ibnd_start), kdmx, & sc2_d(1,1), nbsize, ONE, phi_d(1,jbnd_start), kdmx ) ! IF ( eigen_ ) & - CALL gpu_ZGEMM( 'N', 'N', kdim, jbnd_size, ibnd_size, MONE, hphi_d(1,ibnd_start), kdmx, & + CALL MYZGEMM( 'N', 'N', kdim, jbnd_size, ibnd_size, MONE, hphi_d(1,ibnd_start), kdmx, & sc2_d(1,1), nbsize, ONE, hphi_d(1,jbnd_start), kdmx ) ! IF ( uspp ) & - CALL gpu_ZGEMM( 'N', 'N', kdim, jbnd_size, ibnd_size, MONE, sphi_d(1,ibnd_start), kdmx, & + CALL MYZGEMM( 'N', 'N', kdim, jbnd_size, ibnd_size, MONE, sphi_d(1,ibnd_start), kdmx, & sc2_d(1,1), nbsize, ONE, sphi_d(1,jbnd_start), kdmx ) ! RETURN @@ -368,7 +368,7 @@ CONTAINS ! INTEGER :: ibnd, ibnd_start, ibnd_end ! - COMPLEX(DP), EXTERNAL :: ZDOTC_gpu + COMPLEX(DP), EXTERNAL :: MYZDOTC ! ! ... ! @@ -378,7 +378,7 @@ CONTAINS ! DO ibnd = ibnd_start, ibnd_end ! - e(ibnd) = DBLE( ZDOTC_gpu( kdim, psi_d(1,ibnd), 1, hpsi_d(1,ibnd), 1 ) ) + e(ibnd) = DBLE( MYZDOTC( kdim, psi_d(1,ibnd), 1, hpsi_d(1,ibnd), 1 ) ) ! END DO ! @@ -410,13 +410,13 @@ CONTAINS e(ibnd) = e(ibnd-1) e(ibnd-1) = e0 ! - CALL ZSWAP_gpu( kdim, psi_d(1,ibnd), 1, psi_d(1,ibnd-1), 1 ) + CALL MYZSWAP( kdim, psi_d(1,ibnd), 1, psi_d(1,ibnd-1), 1 ) ! IF ( eigen_ ) & - CALL ZSWAP_gpu( kdim, hpsi_d(1,ibnd), 1, hpsi_d(1,ibnd-1), 1 ) + CALL MYZSWAP( kdim, hpsi_d(1,ibnd), 1, hpsi_d(1,ibnd-1), 1 ) ! IF ( uspp ) & - CALL ZSWAP_gpu( kdim, spsi_d(1,ibnd), 1, spsi_d(1,ibnd-1), 1 ) + CALL MYZSWAP( kdim, spsi_d(1,ibnd), 1, spsi_d(1,ibnd-1), 1 ) ! END IF ! diff --git a/KS_Solvers/PPCG/generic_cublas.f90 b/KS_Solvers/PPCG/generic_cublas.f90 index 8ee2bc811..394e070bb 100644 --- a/KS_Solvers/PPCG/generic_cublas.f90 +++ b/KS_Solvers/PPCG/generic_cublas.f90 @@ -1,252 +1,3 @@ -subroutine gpu_DGEMM (transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) -#if defined(__CUDA) -USE cublas -#endif -implicit none - character*1 transa, transb - integer :: m, n, k, lda, ldb, ldc - DOUBLE PRECISION :: alpha, beta - DOUBLE PRECISION, dimension(lda, *) :: A - DOUBLE PRECISION, dimension(ldb, *) :: B - DOUBLE PRECISION, dimension(ldc, *) :: C -#if defined(__CUDA) - attributes(device) :: A, B, C - call cublasDGEMM(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) -#endif - return -end subroutine gpu_DGEMM -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -subroutine gpu_ZGEMM (transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) -#if defined(__CUDA) -USE cublas -#endif -implicit none - character*1 transa, transb - integer :: m, n, k, lda, ldb, ldc - DOUBLE COMPLEX :: alpha, beta - DOUBLE COMPLEX, dimension(lda, *) :: A - DOUBLE COMPLEX, dimension(ldb, *) :: B - DOUBLE COMPLEX, dimension(ldc, *) :: C -#if defined(__CUDA) - attributes(device) :: A, B, C - call cublasZGEMM(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) -#endif - return -end subroutine gpu_ZGEMM -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -subroutine gpu_DGER (m, n, alpha, x, incx, y, incy, a, lda) -#if defined(__CUDA) -USE cublas -#endif -implicit none - integer :: m, n, lda, incx, incy - DOUBLE PRECISION :: alpha - DOUBLE PRECISION, dimension(lda, *) :: A - DOUBLE PRECISION, dimension(*) :: x, y -#if defined(__CUDA) - attributes(device) :: A, x, y - call cublasDGER(m, n, alpha, x, incx, y, incy, a, lda) -#endif - return -end subroutine gpu_DGER -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -function gpu_DDOT (n, dx, incx, dy, incy) -#if defined(__CUDA) -USE cublas -#endif -implicit none - DOUBLE PRECISION :: gpu_DDOT - integer :: n, incx, incy - DOUBLE PRECISION, dimension(*) :: dx, dy -#if defined(__CUDA) - attributes(device) :: dx, dy - gpu_DDOT=cublasDDOT(n, dx, incx, dy, incy) -#endif - return -end function gpu_DDOT -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -subroutine gpu_DTRSM(side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb) -#if defined(__CUDA) -USE cublas -#endif -implicit none - character*1 :: side, uplo, transa, diag - integer :: m, n, lda, ldb - DOUBLE PRECISION :: alpha - DOUBLE PRECISION, dimension(lda, *) :: a - DOUBLE PRECISION, dimension(ldb, *) :: b -#if defined(__CUDA) - attributes(device) :: a, b - call cublasDTRSM(side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb) -#endif - return -end subroutine gpu_DTRSM -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -DOUBLE COMPLEX function ZDOTC_gpu(n, zx, incx, zy, incy) -#if defined(__CUDA) -USE cublas -#endif -implicit none - integer :: n, incx, incy - DOUBLE COMPLEX, dimension(*) :: zx, zy -#if defined(__CUDA) - attributes(device) :: zx, zy - ZDOTC_gpu = cublasZDOTC(n, zx, incx, zy, incy) -#endif - return -end function ZDOTC_gpu -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -SUBROUTINE ZSWAP_gpu(n, zx, incx, zy, incy) -#if defined(__CUDA) -USE cublas -#endif -implicit none - integer :: n, incx, incy - DOUBLE COMPLEX, dimension(*) :: zx, zy -#if defined(__CUDA) - attributes(device) :: zx, zy - CALL cublasZSWAP(n, zx, incx, zy, incy) -#endif - return -END SUBROUTINE ZSWAP_gpu -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -SUBROUTINE ZCOPY_gpu(n, zx, incx, zy, incy) -#if defined(__CUDA) -USE cublas -#endif -IMPLICIT NONE - INTEGER :: n, incx, incy - DOUBLE COMPLEX, dimension(*) :: zx, zy -#if defined(__CUDA) - attributes(device) :: zx, zy - CALL cublasZCOPY(n, zx, incx, zy, incy) -#endif - RETURN -END SUBROUTINE ZCOPY_gpu -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -SUBROUTINE ZAXPY_gpu(n, za, zx, incx, zy, incy) -#if defined(__CUDA) -USE cublas -#endif -IMPLICIT NONE - INTEGER :: n, incx, incy - DOUBLE COMPLEX :: za - DOUBLE COMPLEX, dimension(*) :: zx, zy -#if defined(__CUDA) - attributes(device) :: zx, zy - CALL cublasZAXPY(n, za, zx, incx, zy, incy) -#endif - RETURN -END SUBROUTINE ZAXPY_gpu -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -SUBROUTINE ZGEMV_gpu(trans, m, n, alpha, a, lda, x, incx, beta, y, incy) -#if defined(__CUDA) -USE cublas -#endif -IMPLICIT NONE - CHARACTER :: trans - INTEGER :: lda, m, n, incx, incy - DOUBLE COMPLEX :: alpha, beta - DOUBLE COMPLEX, dimension(lda, *) :: a - DOUBLE COMPLEX, dimension(*) :: x, y -#if defined(__CUDA) - attributes(device) :: a, x, y - CALL cublasZGEMV(trans, m, n, alpha, a, lda, x, incx, beta, y, incy) -#endif - RETURN -END SUBROUTINE ZGEMV_gpu -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -SUBROUTINE ZDSCAL_gpu(n, da, zx, incx) -#if defined(__CUDA) -USE cublas -#endif -IMPLICIT NONE - INTEGER :: n, incx - DOUBLE PRECISION :: da - DOUBLE COMPLEX, dimension(*) :: zx -#if defined(__CUDA) - attributes(device) :: zx - CALL cublasZDSCAL(n, da, zx, incx) -#endif - RETURN -END SUBROUTINE ZDSCAL_gpu -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -SUBROUTINE ZSCAL_gpu(n, za, zx, incx) -#if defined(__CUDA) -USE cublas -#endif -IMPLICIT NONE - INTEGER :: n, incx - DOUBLE COMPLEX :: za - DOUBLE COMPLEX, dimension(*) :: zx -#if defined(__CUDA) - attributes(device) :: zx - CALL cublasZSCAL(n, za, zx, incx) -#endif - RETURN -END SUBROUTINE ZSCAL_gpu -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -SUBROUTINE DGEMV_gpu(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) -#if defined(__CUDA) -use cublas -#endif -IMPLICIT NONE - DOUBLE PRECISION :: ALPHA,BETA - INTEGER :: INCX,INCY,LDA,M,N - CHARACTER :: TRANS - DOUBLE PRECISION :: A(LDA,*),X(*),Y(*) -#if defined(__CUDA) - attributes(device) :: A, X, Y - call cublasDGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) -#endif - RETURN -END SUBROUTINE DGEMV_gpu -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -SUBROUTINE DCOPY_gpu(n, x, incx, y, incy) -#if defined(__CUDA) -USE cublas -#endif -IMPLICIT NONE - INTEGER :: n, incx, incy - DOUBLE PRECISION, INTENT(IN) :: x(*) - DOUBLE PRECISION, INTENT(OUT) :: y(*) -#if defined(__CUDA) - attributes(device) :: x, y - call cublasDCOPY(n, x, incx, y, incy) -#endif - RETURN -END SUBROUTINE DCOPY_gpu -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -SUBROUTINE DAXPY_gpu(n, a, x, incx, y, incy) -#if defined(__CUDA) -USE cublas -#endif -IMPLICIT NONE - INTEGER :: n, incx, incy - DOUBLE PRECISION, INTENT(IN) :: a - DOUBLE PRECISION, INTENT(IN) :: x(*) - DOUBLE PRECISION, INTENT(OUT) :: y(*) -#if defined(__CUDA) - attributes(device) :: x, y - call cublasDAXPY( n, a, x, incx, y, incy) -#endif - RETURN -END SUBROUTINE DAXPY_gpu -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -SUBROUTINE DSCAL_gpu(n, a, x, incx) -#if defined(__CUDA) -USE cublas -#endif -IMPLICIT NONE - integer :: n, incx - DOUBLE PRECISION :: a - DOUBLE PRECISION, dimension(*) :: x -#if defined(__CUDA) - attributes(device) :: x - call cublasDSCAL(n, a, x, incx) -#endif - RETURN -END SUBROUTINE DSCAL_gpu !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! SUBROUTINE gpu_threaded_memset(array, val, length) ! @@ -384,18 +135,3 @@ SUBROUTINE gpu_threaded_backassign(array_out, idx, array_in, kdimx, nact, use_a2 END IF ! END SUBROUTINE gpu_threaded_backassign -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -SUBROUTINE DSWAP_gpu(n, dx, incx, dy, incy) -#if defined(__CUDA) -USE cublas -#endif -implicit none - integer :: n, incx, incy - REAL(8), dimension(*) :: dx, dy -#if defined(__CUDA) - attributes(device) :: dx, dy - CALL cublasDSWAP(n, dx, incx, dy, incy) -#endif - return -END SUBROUTINE DSWAP_gpu -!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! diff --git a/KS_Solvers/PPCG/ppcg_gamma_gpu.f90 b/KS_Solvers/PPCG/ppcg_gamma_gpu.f90 index 3b484be49..2c1ded7a2 100644 --- a/KS_Solvers/PPCG/ppcg_gamma_gpu.f90 +++ b/KS_Solvers/PPCG/ppcg_gamma_gpu.f90 @@ -191,8 +191,8 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & call gpu_threaded_memset( G_d, ZERO, nbnd*nbnd ) ! G = ZERO CALL divide(inter_bgrp_comm,nbnd,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nbnd,n_start,n_end if (n_start .le. n_end) & - CALL gpu_DGEMM('T','N', nbnd, my_n, npw2, 2.D0, psi_d, npwx2, hpsi_d(1,n_start), npwx2, 0.D0, G_d(1,n_start), nbnd) - IF ( gstart == 2 ) CALL gpu_DGER( nbnd, my_n, -1.D0, psi_d, npwx2, hpsi_d(1,n_start), npwx2, G_d(1,n_start), nbnd ) + CALL MYDGEMM('T','N', nbnd, my_n, npw2, 2.D0, psi_d, npwx2, hpsi_d(1,n_start), npwx2, 0.D0, G_d(1,n_start), nbnd) + IF ( gstart == 2 ) CALL MYDGER( nbnd, my_n, -1.D0, psi_d, npwx2, hpsi_d(1,n_start), npwx2, G_d(1,n_start), nbnd ) CALL mp_sum( G_d, inter_bgrp_comm ) CALL mp_sum( G_d, intra_bgrp_comm ) call stop_clock('ppcg:dgemm') @@ -203,10 +203,10 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & CALL divide(inter_bgrp_comm,nbnd,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nbnd,n_start,n_end if (overlap) then if (n_start .le. n_end) & - CALL gpu_DGEMM('N','N',npw2, nbnd, my_n, -ONE, spsi_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, w_d, npwx2) + CALL MYDGEMM('N','N',npw2, nbnd, my_n, -ONE, spsi_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, w_d, npwx2) else if (n_start .le. n_end) & - CALL gpu_DGEMM('N','N',npw2, nbnd, my_n, -ONE, psi_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, w_d, npwx2) + CALL MYDGEMM('N','N',npw2, nbnd, my_n, -ONE, psi_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, w_d, npwx2) end if CALL mp_sum( w_d, inter_bgrp_comm ) call stop_clock('ppcg:dgemm') @@ -260,12 +260,12 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & CALL divide(inter_bgrp_comm,nbnd,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nbnd,n_start,n_end if (overlap) then if (n_start .le. n_end) & - CALL gpu_DGEMM( 'T','N', my_n, nact, npw2, 2.D0, spsi_d(1,n_start), npwx2, buffer_d, npwx2, 0.D0, G_d(n_start,1), nbnd ) - IF ( gstart == 2 ) CALL gpu_DGER( my_n, nact, -1.D0, spsi_d(1,n_start), npwx2, buffer_d, npwx2, G_d(n_start,1), nbnd ) + CALL MYDGEMM( 'T','N', my_n, nact, npw2, 2.D0, spsi_d(1,n_start), npwx2, buffer_d, npwx2, 0.D0, G_d(n_start,1), nbnd ) + IF ( gstart == 2 ) CALL MYDGER( my_n, nact, -1.D0, spsi_d(1,n_start), npwx2, buffer_d, npwx2, G_d(n_start,1), nbnd ) else if (n_start .le. n_end) & - CALL gpu_DGEMM( 'T','N', my_n, nact, npw2, 2.D0, psi_d(1,n_start), npwx2, buffer_d, npwx2, 0.D0, G_d(n_start,1), nbnd ) - IF ( gstart == 2 ) CALL gpu_DGER( my_n, nact, -1.D0, psi_d(1,n_start), npwx2, buffer_d, npwx2, G_d(n_start,1), nbnd ) + CALL MYDGEMM( 'T','N', my_n, nact, npw2, 2.D0, psi_d(1,n_start), npwx2, buffer_d, npwx2, 0.D0, G_d(n_start,1), nbnd ) + IF ( gstart == 2 ) CALL MYDGER( my_n, nact, -1.D0, psi_d(1,n_start), npwx2, buffer_d, npwx2, G_d(n_start,1), nbnd ) end if G = G_d CALL mp_sum( G(1:nbnd,1:nact), inter_bgrp_comm ) @@ -277,7 +277,7 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & call start_clock('ppcg:dgemm') call gpu_threaded_assign( buffer_d, w_d, npwx, nact, .true., act_idx_d, .true. ) if (n_start .le. n_end) & - CALL gpu_DGEMM('N','N', npw2, nact, my_n, -ONE, psi_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, buffer_d, npwx2) + CALL MYDGEMM('N','N', npw2, nact, my_n, -ONE, psi_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, buffer_d, npwx2) buffer = buffer_d CALL mp_sum( buffer(:,1:nact), inter_bgrp_comm ) buffer_d = buffer @@ -343,8 +343,8 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & call gpu_threaded_assign( buffer1_d, p_d, npwx, nact, .true., act_idx_d, .false. ) CALL divide(inter_bgrp_comm,nact,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nact,n_start,n_end if (n_start .le. n_end) & - CALL gpu_DGEMM('T','N', my_n, nact, npw2, 2.D0, buffer_d(1,n_start), npwx2, buffer1_d, npwx2, 0.D0, G_d(n_start,1), nbnd) - IF ( gstart == 2 ) CALL gpu_DGER( my_n, nact, -1.D0, buffer_d(1,n_start), npwx2, buffer1_d, npwx2, G_d(n_start,1), nbnd ) + CALL MYDGEMM('T','N', my_n, nact, npw2, 2.D0, buffer_d(1,n_start), npwx2, buffer1_d, npwx2, 0.D0, G_d(n_start,1), nbnd) + IF ( gstart == 2 ) CALL MYDGER( my_n, nact, -1.D0, buffer_d(1,n_start), npwx2, buffer1_d, npwx2, G_d(n_start,1), nbnd ) G = G_d CALL mp_sum( G(1:nact,1:nact), inter_bgrp_comm ) CALL mp_sum( G(1:nact,1:nact), intra_bgrp_comm ) @@ -356,7 +356,7 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & call gpu_threaded_assign( buffer_d, p_d, npwx, nact, .true., act_idx_d, .true. ) call gpu_threaded_assign( buffer1_d, psi_d, npwx, nact, .true., act_idx_d, .false. ) if (n_start .le. n_end) & ! could be done differently - CALL gpu_DGEMM('N','N', npw2, nact, my_n,-ONE, buffer1_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, buffer_d, npwx2) + CALL MYDGEMM('N','N', npw2, nact, my_n,-ONE, buffer1_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, buffer_d, npwx2) buffer = buffer_d CALL mp_sum( buffer(:,1:nact), inter_bgrp_comm ) buffer_d = buffer @@ -368,7 +368,7 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & call gpu_threaded_assign( buffer_d, hp_d, npwx, nact, .true., act_idx_d, .true. ) call gpu_threaded_assign( buffer1_d, hpsi_d, npwx, nact, .true., act_idx_d, .false. ) if (n_start .le. n_end) & - CALL gpu_DGEMM('N','N', npw2, nact, my_n,-ONE, buffer1_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, buffer_d, npwx2) + CALL MYDGEMM('N','N', npw2, nact, my_n,-ONE, buffer1_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, buffer_d, npwx2) buffer = buffer_d CALL mp_sum( buffer(:,1:nact), inter_bgrp_comm ) buffer_d = buffer @@ -381,7 +381,7 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & call gpu_threaded_assign( buffer_d, sp_d, npwx, nact, .true., act_idx_d, .true. ) call gpu_threaded_assign( buffer1_d, spsi_d, npwx, nact, .true., act_idx_d, .false. ) if (n_start .le. n_end) & - CALL gpu_DGEMM('N','N', npw2, nact, my_n,-ONE, buffer1_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, buffer_d, npwx2) + CALL MYDGEMM('N','N', npw2, nact, my_n,-ONE, buffer1_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, buffer_d, npwx2) buffer = buffer_d CALL mp_sum( buffer(:,1:nact), inter_bgrp_comm ) buffer_d = buffer @@ -416,44 +416,44 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & call start_clock('ppcg:dgemm') call gpu_threaded_assign( buffer_d, psi_d, npwx, l, .true., col_idx_d, .false. ) call gpu_threaded_assign( buffer1_d, hpsi_d, npwx, l, .true., col_idx_d, .false. ) - CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d, sbsize3) - IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d, sbsize3 ) + CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d, sbsize3) + IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d, sbsize3 ) ! if (overlap) then call gpu_threaded_assign( buffer1_d, spsi_d, npwx, l, .true., col_idx_d, .false.) else call gpu_threaded_assign( buffer1_d, buffer_d, npwx, l, .false., col_idx_d, .false. ) end if - CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d, sbsize3) - IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d, sbsize3 ) + CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d, sbsize3) + IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d, sbsize3 ) ! ! --- call gpu_threaded_assign( buffer_d, w_d, npwx, l, .true., col_idx_d, .false. ) call gpu_threaded_assign( buffer1_d, hw_d, npwx, l, .true., col_idx_d, .false. ) - CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d(l+1, l+1), sbsize3) - IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d(l+1, l+1), sbsize3 ) + CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d(l+1, l+1), sbsize3) + IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d(l+1, l+1), sbsize3 ) ! if (overlap) then call gpu_threaded_assign( buffer1_d, sw_d, npwx, l, .true., col_idx_d, .false. ) else call gpu_threaded_assign( buffer1_d, buffer_d, npwx, l, .false., col_idx_d, .false. ) end if - CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d(l+1, l+1 ), sbsize3) - IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d(l+1, l+1), sbsize3 ) + CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d(l+1, l+1 ), sbsize3) + IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d(l+1, l+1), sbsize3 ) ! ! --- call gpu_threaded_assign( buffer_d, psi_d, npwx, l, .true., col_idx_d, .false. ) call gpu_threaded_assign( buffer1_d, hw_d, npwx, l, .true., col_idx_d, .false. ) - CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d(1, l+1), sbsize3) - IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d(1, l+1), sbsize3 ) + CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d(1, l+1), sbsize3) + IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d(1, l+1), sbsize3 ) ! if (overlap) then call gpu_threaded_assign( buffer1_d, sw_d, npwx, l, .true., col_idx_d, .false. ) else call gpu_threaded_assign( buffer1_d, w_d, npwx, l, .true., col_idx_d, .false. ) end if - CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d(1, l+1), sbsize3) - IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d(1, l+1), sbsize3 ) + CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d(1, l+1), sbsize3) + IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d(1, l+1), sbsize3 ) call stop_clock('ppcg:dgemm') ! ! --- @@ -463,30 +463,30 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & call start_clock('ppcg:dgemm') call gpu_threaded_assign( buffer_d, p_d, npwx, l, .true., col_idx_d, .false. ) call gpu_threaded_assign( buffer1_d, hp_d, npwx, l, .true., col_idx_d, .false. ) - CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d(2*l + 1, 2*l+1), sbsize3) - IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d(2*l + 1, 2*l+1 ), sbsize3 ) + CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d(2*l + 1, 2*l+1), sbsize3) + IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d(2*l + 1, 2*l+1 ), sbsize3 ) ! if (overlap) then call gpu_threaded_assign( buffer1_d, sp_d, npwx, l, .true., col_idx_d, .false. ) else call gpu_threaded_assign( buffer1_d, buffer_d, npwx, l, .false., col_idx_d, .false. ) end if - CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d(2*l + 1, 2*l+1), sbsize3) - IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d(2*l + 1, 2*l+1 ), sbsize3) + CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d(2*l + 1, 2*l+1), sbsize3) + IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d(2*l + 1, 2*l+1 ), sbsize3) ! ! --- call gpu_threaded_assign( buffer_d, psi_d, npwx, l, .true., col_idx_d, .false. ) call gpu_threaded_assign( buffer1_d, hp_d, npwx, l, .true., col_idx_d, .false. ) - CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d(1, 2*l+1), sbsize3) - IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d(1, 2*l+1), sbsize3) + CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d(1, 2*l+1), sbsize3) + IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d(1, 2*l+1), sbsize3) ! if (overlap) then call gpu_threaded_assign( buffer1_d, sp_d, npwx, l, .true., col_idx_d, .false. ) else call gpu_threaded_assign( buffer1_d, p_d, npwx, l, .true., col_idx_d, .false. ) end if - CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d(1, 2*l+1), sbsize3) - IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d(1, 2*l+1), sbsize3) + CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d(1, 2*l+1), sbsize3) + IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d(1, 2*l+1), sbsize3) call stop_clock('ppcg:dgemm') ! ! --- @@ -494,16 +494,16 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & call start_clock('ppcg:dgemm') call gpu_threaded_assign( buffer_d, w_d, npwx, l, .true., col_idx_d, .false. ) call gpu_threaded_assign( buffer1_d, hp_d, npwx, l, .true., col_idx_d, .false. ) - CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d(l+1, 2*l+1), sbsize3) - IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d(l+1, 2*l+1), sbsize3) + CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, K_d(l+1, 2*l+1), sbsize3) + IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, K_d(l+1, 2*l+1), sbsize3) ! if (overlap) then call gpu_threaded_assign( buffer1_d, sp_d, npwx, l, .true., col_idx_d, .false. ) else call gpu_threaded_assign( buffer1_d, p_d, npwx, l, .true., col_idx_d, .false. ) end if - CALL gpu_DGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d(l+1, 2*l+1), sbsize3) - IF ( gstart == 2 ) CALL gpu_DGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d(l+1, 2*l+1), sbsize3) + CALL MYDGEMM('T','N', l, l, npw2, 2.D0, buffer_d, npwx2, buffer1_d, npwx2, 0.D0, M_d(l+1, 2*l+1), sbsize3) + IF ( gstart == 2 ) CALL MYDGER( l, l, -1.D0, buffer_d, npwx2, buffer1_d, npwx2, M_d(l+1, 2*l+1), sbsize3) call stop_clock('ppcg:dgemm') ! END IF @@ -594,18 +594,18 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & ! call start_clock('ppcg:dgemm') call gpu_threaded_assign( buffer1_d, p_d, npwx, l, .true., col_idx_d, .false. ) - CALL gpu_DGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_p_d, sbsize, ZERO, buffer_d, npwx2) + CALL MYDGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_p_d, sbsize, ZERO, buffer_d, npwx2) call gpu_threaded_assign( buffer1_d, w_d, npwx, l, .true., col_idx_d, .false. ) - CALL gpu_DGEMM('N','N', npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ONE, buffer_d, npwx2) + CALL MYDGEMM('N','N', npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ONE, buffer_d, npwx2) ! p(:,col_idx(1:l)) = buffer(:,1:l) call gpu_threaded_backassign( p_d, col_idx_d, buffer_d, npwx, l, .false., p_d ) call stop_clock('ppcg:dgemm') ! call start_clock('ppcg:dgemm') call gpu_threaded_assign( buffer1_d, hp_d, npwx, l, .true., col_idx_d, .false. ) - CALL gpu_DGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_p_d, sbsize, ZERO, buffer_d, npwx2) + CALL MYDGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_p_d, sbsize, ZERO, buffer_d, npwx2) call gpu_threaded_assign( buffer1_d, hw_d, npwx, l, .true., col_idx_d, .false. ) - CALL gpu_DGEMM('N','N', npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ONE, buffer_d, npwx2) + CALL MYDGEMM('N','N', npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ONE, buffer_d, npwx2) ! hp(:,col_idx(1:l)) = buffer(:,1:l) call gpu_threaded_backassign( hp_d, col_idx_d, buffer_d, npwx, l, .false., hp_d ) call stop_clock('ppcg:dgemm') @@ -613,9 +613,9 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & if (overlap) then call start_clock('ppcg:dgemm') call gpu_threaded_assign( buffer1_d, sp_d, npwx, l, .true., col_idx_d, .false. ) - CALL gpu_DGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_p_d, sbsize, ZERO, buffer_d, npwx2) + CALL MYDGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_p_d, sbsize, ZERO, buffer_d, npwx2) call gpu_threaded_assign( buffer1_d, sw_d, npwx, l, .true., col_idx_d, .false. ) - CALL gpu_DGEMM('N','N', npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ONE, buffer_d, npwx2) + CALL MYDGEMM('N','N', npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ONE, buffer_d, npwx2) ! sp(:,col_idx(1:l)) = buffer(:,1:l) call gpu_threaded_backassign( sp_d, col_idx_d, buffer_d, npwx, l, .false., sp_d ) call stop_clock('ppcg:dgemm') @@ -624,14 +624,14 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & ! call start_clock('ppcg:dgemm') call gpu_threaded_assign( buffer1_d, w_d, npwx, l, .true., col_idx_d, .false. ) - CALL gpu_DGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ZERO, buffer_d, npwx2) + CALL MYDGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ZERO, buffer_d, npwx2) ! p(:,col_idx(1:l)) = buffer(:, 1:l) call gpu_threaded_backassign( p_d, col_idx_d, buffer_d, npwx, l, .false., p_d ) call stop_clock('ppcg:dgemm') ! call start_clock('ppcg:dgemm') call gpu_threaded_assign( buffer1_d, hw_d, npwx, l, .true., col_idx_d, .false. ) - CALL gpu_DGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ZERO, buffer_d, npwx2) + CALL MYDGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ZERO, buffer_d, npwx2) ! hp(:,col_idx(1:l)) = buffer(:, 1:l) call gpu_threaded_backassign( hp_d, col_idx_d, buffer_d, npwx, l, .false., hp_d ) call stop_clock('ppcg:dgemm') @@ -639,7 +639,7 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & if (overlap) then call start_clock('ppcg:dgemm') call gpu_threaded_assign( buffer1_d, sw_d, npwx, l, .true., col_idx_d, .false. ) - CALL gpu_DGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ZERO, buffer_d, npwx2) + CALL MYDGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_w_d, sbsize, ZERO, buffer_d, npwx2) ! sp(:,col_idx(1:l)) = buffer(:, 1:l) call gpu_threaded_backassign( sp_d, col_idx_d, buffer_d, npwx, l, .false., sp_d ) call stop_clock('ppcg:dgemm') @@ -649,14 +649,14 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & ! Update the sub-blocks of psi and hpsi (and spsi) call start_clock('ppcg:dgemm') call gpu_threaded_assign( buffer1_d, psi_d, npwx, l, .true., col_idx_d ) - CALL gpu_DGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_psi_d, sbsize, ZERO, buffer_d, npwx2) + CALL MYDGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_psi_d, sbsize, ZERO, buffer_d, npwx2) ! psi(:,col_idx(1:l)) = buffer(:,1:l) + p(:,col_idx(1:l)) call gpu_threaded_backassign( psi_d, col_idx_d, buffer_d, npwx, l, .true., p_d ) call stop_clock('ppcg:dgemm') ! call start_clock('ppcg:dgemm') call gpu_threaded_assign( buffer1_d, hpsi_d, npwx, l, .true., col_idx_d ) - CALL gpu_DGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_psi_d, sbsize, ZERO, buffer_d, npwx2) + CALL MYDGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_psi_d, sbsize, ZERO, buffer_d, npwx2) ! hpsi(:,col_idx(1:l)) = buffer(:,1:l) + hp(:,col_idx(1:l)) call gpu_threaded_backassign( hpsi_d, col_idx_d, buffer_d, npwx, l, .true., hp_d ) call stop_clock('ppcg:dgemm') @@ -664,7 +664,7 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & if (overlap) then call start_clock('ppcg:dgemm') call gpu_threaded_assign( buffer1_d, spsi_d, npwx, l, .true., col_idx_d, .false. ) - CALL gpu_DGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_psi_d, sbsize, ZERO, buffer_d, npwx2) + CALL MYDGEMM('N','N',npw2, l, l, ONE, buffer1_d, npwx2, coord_psi_d, sbsize, ZERO, buffer_d, npwx2) ! spsi(:,col_idx(1:l)) = buffer(:,1:l) + sp(:,col_idx(1:l)) call gpu_threaded_backassign( spsi_d, col_idx_d, buffer_d, npwx, l, .true., sp_d ) call stop_clock('ppcg:dgemm') @@ -857,8 +857,8 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & call gpu_threaded_memset( G_d, ZERO, nbnd*nbnd ) ! G = ZERO CALL divide(inter_bgrp_comm,nact,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nact,n_start,n_end if (n_start .le. n_end) & - CALL gpu_DGEMM('T','N', nact, my_n, npw2, 2.D0, buffer_d, npwx2, buffer1_d(1,n_start), npwx2, 0.D0, G_d(1,n_start), nbnd) - IF ( gstart == 2 ) CALL gpu_DGER( nact, my_n, -1.D0, buffer_d, npwx2, buffer1_d(1,n_start), npwx2, G_d(1,n_start), nbnd ) + CALL MYDGEMM('T','N', nact, my_n, npw2, 2.D0, buffer_d, npwx2, buffer1_d(1,n_start), npwx2, 0.D0, G_d(1,n_start), nbnd) + IF ( gstart == 2 ) CALL MYDGER( nact, my_n, -1.D0, buffer_d, npwx2, buffer1_d(1,n_start), npwx2, G_d(1,n_start), nbnd ) G = G_d CALL mp_sum(G(1:nact,1:nact), inter_bgrp_comm) ! @@ -875,7 +875,7 @@ SUBROUTINE ppcg_gamma_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & end if call start_clock('ppcg:dgemm') if (n_start .le. n_end) & - CALL gpu_DGEMM('N','N',npw2, nact, my_n, -ONE, buffer1_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, buffer_d, npwx2) + CALL MYDGEMM('N','N',npw2, nact, my_n, -ONE, buffer1_d(1,n_start), npwx2, G_d(n_start,1), nbnd, ONE, buffer_d, npwx2) buffer = buffer_d CALL mp_sum( buffer(:,1:nact), inter_bgrp_comm ) buffer_d = buffer @@ -1779,13 +1779,13 @@ CONTAINS ! this proc sends his block ! CALL mp_bcast( Gl(:,1:nc), root, ortho_parent_comm ) - CALL gpu_DGEMM( 'N','N', n2, nc, nr, ONE, X(1,ir), ld2, Gl, nx, gamm, Xtmp(1,ic), ld2 ) + CALL MYDGEMM( 'N','N', n2, nc, nr, ONE, X(1,ir), ld2, Gl, nx, gamm, Xtmp(1,ic), ld2 ) ELSE ! ! all other procs receive ! CALL mp_bcast( Gltmp(:,1:nc), root, ortho_parent_comm ) - CALL gpu_DGEMM( 'N','N', n2, nc, nr, ONE, X(1,ir), ld2, Gltmp, nx, gamm, Xtmp(1,ic), ld2 ) + CALL MYDGEMM( 'N','N', n2, nc, nr, ONE, X(1,ir), ld2, Gltmp, nx, gamm, Xtmp(1,ic), ld2 ) END IF ! gamm = ONE diff --git a/KS_Solvers/PPCG/ppcg_k_gpu.f90 b/KS_Solvers/PPCG/ppcg_k_gpu.f90 index 685266d57..9b30e2de5 100644 --- a/KS_Solvers/PPCG/ppcg_k_gpu.f90 +++ b/KS_Solvers/PPCG/ppcg_k_gpu.f90 @@ -196,7 +196,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & G_d = C_ZERO CALL divide(inter_bgrp_comm,nbnd,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nbnd,n_start,n_end if (n_start .le. n_end) & - CALL gpu_ZGEMM('C','N', nbnd, my_n, kdim, C_ONE, psi_d, kdimx, hpsi_d(1,n_start), kdimx, C_ZERO, G_d(1,n_start), nbnd) + CALL MYZGEMM('C','N', nbnd, my_n, kdim, C_ONE, psi_d, kdimx, hpsi_d(1,n_start), kdimx, C_ZERO, G_d(1,n_start), nbnd) CALL mp_sum( G_d, inter_bgrp_comm ) ! CALL mp_sum( G_d, intra_bgrp_comm ) @@ -208,10 +208,10 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & CALL divide(inter_bgrp_comm,nbnd,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nbnd,n_start,n_end if (overlap) then if (n_start .le. n_end) & - CALL gpu_ZGEMM('N','N',kdim, nbnd, my_n, -C_ONE,spsi_d(1,n_start), kdimx, G_d(n_start,1), nbnd, C_ONE, w_d, kdimx) + CALL MYZGEMM('N','N',kdim, nbnd, my_n, -C_ONE,spsi_d(1,n_start), kdimx, G_d(n_start,1), nbnd, C_ONE, w_d, kdimx) else if (n_start .le. n_end) & - CALL gpu_ZGEMM('N','N',kdim, nbnd, my_n, -C_ONE, psi_d(1,n_start), kdimx, G_d(n_start,1), nbnd, C_ONE, w_d, kdimx) + CALL MYZGEMM('N','N',kdim, nbnd, my_n, -C_ONE, psi_d(1,n_start), kdimx, G_d(n_start,1), nbnd, C_ONE, w_d, kdimx) end if CALL mp_sum( w_d, inter_bgrp_comm ) call stop_clock('ppcg:zgemm') @@ -267,11 +267,11 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & CALL divide(inter_bgrp_comm,nbnd,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nbnd,n_start,n_end if (overlap) then if (n_start .le. n_end) & - CALL gpu_ZGEMM( 'C','N', my_n, nact, kdim, C_ONE, spsi_d(1,n_start), kdimx, buffer_d, kdimx, & + CALL MYZGEMM( 'C','N', my_n, nact, kdim, C_ONE, spsi_d(1,n_start), kdimx, buffer_d, kdimx, & C_ZERO, G_d(n_start,1), nbnd ) else if (n_start .le. n_end) & - CALL gpu_ZGEMM( 'C','N', my_n, nact, kdim, C_ONE,psi_d(1,n_start), kdimx, buffer_d, kdimx, & + CALL MYZGEMM( 'C','N', my_n, nact, kdim, C_ONE,psi_d(1,n_start), kdimx, buffer_d, kdimx, & C_ZERO, G_d(n_start,1), nbnd ) end if G = G_d @@ -285,7 +285,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & call start_clock('ppcg:zgemm') call gpu_threaded_assign( buffer_d, w_d, kdimx, nact, .true., act_idx_d, .true. ) if (n_start .le. n_end) & - CALL gpu_ZGEMM('N','N', kdim, nact, my_n, -C_ONE, psi_d(1,n_start), kdimx, G_d(n_start,1), nbnd, C_ONE, & + CALL MYZGEMM('N','N', kdim, nact, my_n, -C_ONE, psi_d(1,n_start), kdimx, G_d(n_start,1), nbnd, C_ONE, & buffer_d, kdimx) CALL mp_sum( buffer_d(:,1:nact), inter_bgrp_comm ) !$cuf kernel DO(2) @@ -377,7 +377,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & call gpu_threaded_assign( buffer1_d, p_d, kdimx, nact, .true., act_idx_d, .false. ) CALL divide(inter_bgrp_comm,nact,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nact,n_start,n_end if (n_start .le. n_end) & - CALL gpu_ZGEMM('C','N', my_n, nact, kdim, C_ONE, buffer_d(1,n_start), kdimx, buffer1_d, & + CALL MYZGEMM('C','N', my_n, nact, kdim, C_ONE, buffer_d(1,n_start), kdimx, buffer1_d, & kdimx, C_ZERO, G_d(n_start,1), nbnd) G = G_d CALL mp_sum( G(1:nact,1:nact), inter_bgrp_comm ) @@ -391,7 +391,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & call gpu_threaded_assign( buffer_d, p_d, kdimx, nact, .true., act_idx_d, .true. ) call gpu_threaded_assign( buffer1_d, psi_d, kdimx, nact, .true., act_idx_d, .false. ) if (n_start .le. n_end) & ! could be done differently - CALL gpu_ZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), & + CALL MYZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), & nbnd, C_ONE, buffer_d, kdimx) CALL mp_sum( buffer_d(:,1:nact), inter_bgrp_comm ) !$cuf kernel do(2) @@ -406,7 +406,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & call gpu_threaded_assign( buffer_d, hp_d, kdimx, nact, .true., act_idx_d, .true. ) call gpu_threaded_assign( buffer1_d, hpsi_d, kdimx, nact, .true., act_idx_d, .false. ) if (n_start .le. n_end) & - CALL gpu_ZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), & + CALL MYZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), & nbnd, C_ONE, buffer_d, kdimx) CALL mp_sum( buffer_d(:,1:nact), inter_bgrp_comm ) !$cuf kernel do(2) @@ -422,7 +422,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & call gpu_threaded_assign( buffer_d, sp_d, kdimx, nact, .true., act_idx_d, .true. ) call gpu_threaded_assign( buffer1_d, spsi_d, kdimx, nact, .true., act_idx_d, .false. ) if (n_start .le. n_end) & - CALL gpu_ZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), & + CALL MYZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), & nbnd, C_ONE, buffer_d, kdimx) CALL mp_sum( buffer_d(:,1:nact), inter_bgrp_comm ) !$cuf kernel do(2) @@ -460,7 +460,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & call start_clock('ppcg:zgemm') call gpu_threaded_assign( buffer_d, psi_d, kdimx, l, .true., col_idx_d, .false. ) call gpu_threaded_assign( buffer1_d, hpsi_d, kdimx, l, .true., col_idx_d, .false. ) - CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, K_d, sbsize3) + CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, K_d, sbsize3) ! if (overlap) then call gpu_threaded_assign( buffer1_d, spsi_d, kdimx, l, .true., col_idx_d, .false. ) @@ -475,19 +475,19 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & else call gpu_threaded_assign( buffer1_d, buffer_d, kdimx, l, .false., col_idx_d, .false. ) end if - CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, M_d, sbsize3) + CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, M_d, sbsize3) ! ! --- call gpu_threaded_assign( buffer_d, w_d, kdimx, l, .true., col_idx_d, .false. ) call gpu_threaded_assign( buffer1_d, hw_d, kdimx, l, .true., col_idx_d, .false. ) - CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, K_d(l+1, l+1), sbsize3) + CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, K_d(l+1, l+1), sbsize3) ! if (overlap) then call gpu_threaded_assign( buffer1_d, sw_d, kdimx, l, .true., col_idx_d, .false. ) else call gpu_threaded_assign( buffer1_d, buffer_d, kdimx, l, .false., col_idx_d, .false. ) end if - CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, M_d(l+1, l+1 ), sbsize3) + CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, M_d(l+1, l+1 ), sbsize3) ! ! --- call gpu_threaded_assign( buffer_d, psi_d, kdimx, l, .true., col_idx_d, .false. ) @@ -500,14 +500,14 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & END DO END DO end if - CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, K_d(1, l+1), sbsize3) + CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, K_d(1, l+1), sbsize3) ! if (overlap) then call gpu_threaded_assign( buffer1_d, sw_d, kdimx, l, .true., col_idx_d, .false. ) else call gpu_threaded_assign( buffer1_d, w_d, kdimx, l, .true., col_idx_d, .false. ) end if - CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, M_d(1, l+1), sbsize3) + CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, M_d(1, l+1), sbsize3) call stop_clock('ppcg:zgemm') ! ! --- @@ -517,7 +517,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & call start_clock('ppcg:zgemm') call gpu_threaded_assign( buffer_d, p_d, kdimx, l, .true., col_idx_d, .false. ) call gpu_threaded_assign( buffer1_d, hp_d, kdimx, l, .true., col_idx_d, .false. ) - CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, & + CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, & K_d(2*l + 1, 2*l+1), sbsize3) ! if (overlap) then @@ -525,13 +525,13 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & else call gpu_threaded_assign( buffer1_d, buffer_d, kdimx, l, .false., col_idx_d, .false. ) end if - CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, & + CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, & M_d(2*l + 1, 2*l+1), sbsize3) ! ! --- call gpu_threaded_assign( buffer_d, psi_d, kdimx, l, .true., col_idx_d, .false. ) call gpu_threaded_assign( buffer1_d, hp_d, kdimx, l, .true., col_idx_d, .false. ) - CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, & + CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, & K_d(1, 2*l+1), sbsize3) ! if (overlap) then @@ -539,7 +539,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & else call gpu_threaded_assign( buffer1_d, p_d, kdimx, l, .true., col_idx_d, .false. ) end if - CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, M_d(1, 2*l+1), sbsize3) + CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, M_d(1, 2*l+1), sbsize3) call stop_clock('ppcg:zgemm') ! ! --- @@ -547,7 +547,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & call start_clock('ppcg:zgemm') call gpu_threaded_assign( buffer_d, w_d, kdimx, l, .true., col_idx_d, .false. ) call gpu_threaded_assign( buffer1_d, hp_d, kdimx, l, .true., col_idx_d, .false. ) - CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, & + CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, & K_d(l+1, 2*l+1), sbsize3) ! if (overlap) then @@ -555,7 +555,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & else call gpu_threaded_assign( buffer1_d, p_d, kdimx, l, .true., col_idx_d, .false. ) end if - CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, & + CALL MYZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, & M_d(l+1, 2*l+1), sbsize3) call stop_clock('ppcg:zgemm') ! @@ -640,9 +640,9 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & ! call start_clock('ppcg:zgemm') call gpu_threaded_assign( buffer1_d, p_d, kdimx, l, .true., col_idx_d, .false. ) - CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_p_d, sbsize, C_ZERO, buffer_d, kdimx) + CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_p_d, sbsize, C_ZERO, buffer_d, kdimx) call gpu_threaded_assign( buffer1_d, w_d, kdimx, l, .true., col_idx_d, .false. ) - CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, C_ONE, buffer_d, kdimx) + CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, C_ONE, buffer_d, kdimx) !$cuf kernel do(2) DO ii = 1, kdimx DO jj = 1, l @@ -653,9 +653,9 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & ! call start_clock('ppcg:zgemm') call gpu_threaded_assign( buffer1_d, hp_d, kdimx, l, .true., col_idx_d, .false. ) - CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_p_d, sbsize, C_ZERO, buffer_d, kdimx) + CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_p_d, sbsize, C_ZERO, buffer_d, kdimx) call gpu_threaded_assign( buffer1_d, hw_d, kdimx, l, .true., col_idx_d, .false. ) - CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, C_ONE, buffer_d, kdimx) + CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, C_ONE, buffer_d, kdimx) !$cuf kernel do(2) DO ii = 1, kdimx DO jj = 1, l @@ -667,9 +667,9 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & if (overlap) then call start_clock('ppcg:zgemm') call gpu_threaded_assign( buffer1_d, sp_d, kdimx, l, .true., col_idx_d, .false. ) - CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_p_d, sbsize, C_ZERO, buffer_d, kdimx) + CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_p_d, sbsize, C_ZERO, buffer_d, kdimx) call gpu_threaded_assign( buffer1_d, sw_d, kdimx, l, .true., col_idx_d, .false. ) - CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, C_ONE, buffer_d, kdimx) + CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, C_ONE, buffer_d, kdimx) !$cuf kernel do(2) DO ii = 1, kdimx DO jj = 1, l @@ -682,7 +682,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & ! call start_clock('ppcg:zgemm') call gpu_threaded_assign( buffer1_d, w_d, kdimx, l, .true., col_idx_d, .false. ) - CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, C_ZERO, buffer_d, kdimx) + CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, C_ZERO, buffer_d, kdimx) !$cuf kernel do(2) DO ii = 1, kdimx DO jj = 1, l @@ -693,7 +693,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & ! call start_clock('ppcg:zgemm') call gpu_threaded_assign( buffer1_d, hw_d, kdimx, l, .true., col_idx_d, .false. ) - CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, C_ZERO, buffer_d, kdimx) + CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, C_ZERO, buffer_d, kdimx) !$cuf kernel do(2) DO ii = 1, kdimx DO jj = 1, l @@ -705,7 +705,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & if (overlap) then call start_clock('ppcg:zgemm') call gpu_threaded_assign( buffer1_d, sw_d, kdimx, l, .true., col_idx_d, .false. ) - CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, c_ZERO, buffer_d, kdimx) + CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_w_d, sbsize, c_ZERO, buffer_d, kdimx) !$cuf kernel do(2) DO ii = 1, kdimx DO jj = 1, l @@ -719,7 +719,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & ! Update the sub-blocks of psi and hpsi (and spsi) call start_clock('ppcg:zgemm') call gpu_threaded_assign( buffer1_d, psi_d, kdimx, l, .true., col_idx_d, .false. ) - CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_psi_d, sbsize, C_ZERO, buffer_d, kdimx) + CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_psi_d, sbsize, C_ZERO, buffer_d, kdimx) !$cuf kernel do(2) DO ii = 1, kdimx DO jj = 1, l @@ -730,7 +730,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & ! call start_clock('ppcg:zgemm') call gpu_threaded_assign( buffer1_d, hpsi_d, kdimx, l, .true., col_idx_d, .false. ) - CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_psi_d, sbsize, C_ZERO, buffer_d, kdimx) + CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_psi_d, sbsize, C_ZERO, buffer_d, kdimx) !$cuf kernel do(2) DO ii = 1, kdimx DO jj = 1, l @@ -742,7 +742,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & if (overlap) then call start_clock('ppcg:zgemm') call gpu_threaded_assign( buffer1_d, spsi_d, kdimx, l, .true., col_idx_d, .false. ) - CALL gpu_ZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_psi_d, sbsize, C_ZERO, buffer_d, kdimx) + CALL MYZGEMM('N','N', kdim, l, l, C_ONE, buffer1_d, kdimx, coord_psi_d, sbsize, C_ZERO, buffer_d, kdimx) !$cuf kernel do(2) DO ii = 1, kdimx DO jj = 1, l @@ -968,7 +968,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & END DO CALL divide(inter_bgrp_comm,nact,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nact,n_start,n_end if (n_start .le. n_end) & - CALL gpu_ZGEMM('C','N', nact, my_n, kdim, C_ONE, buffer_d, kdimx, buffer1_d(1,n_start), kdimx, & + CALL MYZGEMM('C','N', nact, my_n, kdim, C_ONE, buffer_d, kdimx, buffer1_d(1,n_start), kdimx, & C_ZERO, G_d(1,n_start), nbnd) G = G_d CALL mp_sum(G(1:nact,1:nact), inter_bgrp_comm) @@ -986,7 +986,7 @@ SUBROUTINE ppcg_k_gpu( h_psi_ptr, s_psi_ptr, overlap, precondition_d, & end if call start_clock('ppcg:zgemm') if (n_start .le. n_end) & - CALL gpu_ZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), & + CALL MYZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), & nbnd, C_ONE, buffer_d, kdimx) CALL mp_sum( buffer_d(:,1:nact), inter_bgrp_comm ) !$cuf kernel do(2) @@ -1792,13 +1792,13 @@ CONTAINS ! this proc sends his block ! CALL mp_bcast( Gl(:,1:nc), root, ortho_parent_comm ) - CALL gpu_ZGEMM( 'N','N', n, nc, nr, C_ONE, X(1,ir), ld, Gl, nx, gamm, Xtmp(1,ic), ld ) + CALL MYZGEMM( 'N','N', n, nc, nr, C_ONE, X(1,ir), ld, Gl, nx, gamm, Xtmp(1,ic), ld ) ELSE ! ! all other procs receive ! CALL mp_bcast( Gltmp(:,1:nc), root, ortho_parent_comm ) - CALL gpu_ZGEMM( 'N','N', n, nc, nr, C_ONE, X(1,ir), ld, Gltmp, nx, gamm, Xtmp(1,ic), ld ) + CALL MYZGEMM( 'N','N', n, nc, nr, C_ONE, X(1,ir), ld, Gltmp, nx, gamm, Xtmp(1,ic), ld ) END IF ! gamm = C_ONE diff --git a/KS_Solvers/RMM/crmmdiagg_gpu.f90 b/KS_Solvers/RMM/crmmdiagg_gpu.f90 index 61337b715..3a29265aa 100644 --- a/KS_Solvers/RMM/crmmdiagg_gpu.f90 +++ b/KS_Solvers/RMM/crmmdiagg_gpu.f90 @@ -285,7 +285,7 @@ CONTAINS ! INTEGER :: ibnd ! - COMPLEX(DP), EXTERNAL :: ZDOTC_gpu + COMPLEX(DP), EXTERNAL :: MYZDOTC ! ! ... Operate the Hamiltonian : H |psi> ! @@ -312,7 +312,7 @@ CONTAINS !$acc host_data use_device(psi, hpsi) DO ibnd = ibnd_start, ibnd_end ! - hw(ibnd) = DBLE( ZDOTC_gpu( kdim, psi(1,ibnd), 1, hpsi(1,ibnd), 1 ) ) + hw(ibnd) = DBLE( MYZDOTC( kdim, psi(1,ibnd), 1, hpsi(1,ibnd), 1 ) ) ! END DO !$acc end host_data @@ -326,11 +326,11 @@ CONTAINS ! IF ( uspp ) THEN ! - sw(ibnd) = DBLE( ZDOTC_gpu( kdim, psi(1,ibnd), 1, spsi(1,ibnd), 1 ) ) + sw(ibnd) = DBLE( MYZDOTC( kdim, psi(1,ibnd), 1, spsi(1,ibnd), 1 ) ) ! ELSE ! - sw(ibnd) = DBLE( ZDOTC_gpu( kdim, psi(1,ibnd), 1, psi(1,ibnd), 1 ) ) + sw(ibnd) = DBLE( MYZDOTC( kdim, psi(1,ibnd), 1, psi(1,ibnd), 1 ) ) ! END IF ! @@ -394,10 +394,10 @@ CONTAINS IF ( conv(ibnd) ) CYCLE ! !$acc host_data use_device(psi, hpsi, spsi) - CALL ZCOPY_gpu( kdim, psi (1,ibnd), 1, phi_d (1,ibnd,idiis), 1 ) - CALL ZCOPY_gpu( kdim, hpsi(1,ibnd), 1, hphi_d(1,ibnd,idiis), 1 ) + CALL MYZCOPY( kdim, psi (1,ibnd), 1, phi_d (1,ibnd,idiis), 1 ) + CALL MYZCOPY( kdim, hpsi(1,ibnd), 1, hphi_d(1,ibnd,idiis), 1 ) IF ( uspp ) & - CALL ZCOPY_gpu( kdim, spsi(1,ibnd), 1, sphi_d(1,ibnd,idiis), 1 ) + CALL MYZCOPY( kdim, spsi(1,ibnd), 1, sphi_d(1,ibnd,idiis), 1 ) !$acc end host_data ! php(ibnd,idiis) = hw(ibnd) @@ -419,15 +419,15 @@ CONTAINS ! ec = CMPLX( php(ibnd,kdiis), 0._DP, kind=DP ) ! - CALL ZCOPY_gpu( kdim, hphi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 ) + CALL MYZCOPY( kdim, hphi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 ) ! IF ( uspp ) THEN ! - CALL ZAXPY_gpu( kdim, -ec, sphi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 ) + CALL MYZAXPY( kdim, -ec, sphi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 ) ! ELSE ! - CALL ZAXPY_gpu( kdim, -ec, phi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 ) + CALL MYZAXPY( kdim, -ec, phi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 ) ! END IF ! @@ -435,19 +435,19 @@ CONTAINS ! ec = CMPLX( php(ibnd,idiis), 0._DP, kind=DP ) ! - CALL ZCOPY_gpu( kdim, hphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 ) + CALL MYZCOPY( kdim, hphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 ) ! IF ( uspp ) THEN ! - CALL ZAXPY_gpu( kdim, -ec, sphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 ) + CALL MYZAXPY( kdim, -ec, sphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 ) ! ELSE ! - CALL ZAXPY_gpu( kdim, -ec, phi_d(1,ibnd,idiis), 1, vec1_d(1), 1 ) + CALL MYZAXPY( kdim, -ec, phi_d(1,ibnd,idiis), 1, vec1_d(1), 1 ) ! END IF ! - CALL ZGEMV_gpu( 'C', kdim, idiis, ONE, vec2_d(1,1), kdmx, & + CALL MYZGEMV( 'C', kdim, idiis, ONE, vec2_d(1,1), kdmx, & vec1_d(1), 1, ZERO, tc_d(1,jbnd), 1 ) ! END DO @@ -482,21 +482,21 @@ CONTAINS ! DO kdiis = 1, idiis ! - CALL ZCOPY_gpu( kdim, phi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 ) + CALL MYZCOPY( kdim, phi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 ) ! END DO ! IF ( uspp ) THEN ! - CALL ZCOPY_gpu( kdim, sphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 ) + CALL MYZCOPY( kdim, sphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 ) ! ELSE ! - CALL ZCOPY_gpu( kdim, phi_d(1,ibnd,idiis), 1, vec1_d(1), 1 ) + CALL MYZCOPY( kdim, phi_d(1,ibnd,idiis), 1, vec1_d(1), 1 ) ! END IF ! - CALL ZGEMV_gpu( 'C', kdim, idiis, ONE, vec2_d(1,1), kdmx, & + CALL MYZGEMV( 'C', kdim, idiis, ONE, vec2_d(1,1), kdmx, & vec1_d(1), 1, ZERO, tc_d(1,jbnd), 1 ) ! END DO @@ -550,29 +550,29 @@ CONTAINS kvc = vc(kdiis) ! !$acc host_data use_device(psi, hpsi, spsi) - CALL ZAXPY_gpu( kdim, kvc, phi_d (1,ibnd,kdiis), 1, psi(1,ibnd), 1 ) - CALL ZAXPY_gpu( kdim, kvc, hphi_d (1,ibnd,kdiis), 1, hpsi (1,ibnd), 1 ) - IF (uspp) CALL ZAXPY_gpu( kdim, kvc, sphi_d (1,ibnd,kdiis), 1, spsi (1,ibnd), 1 ) + CALL MYZAXPY( kdim, kvc, phi_d (1,ibnd,kdiis), 1, psi(1,ibnd), 1 ) + CALL MYZAXPY( kdim, kvc, hphi_d (1,ibnd,kdiis), 1, hpsi (1,ibnd), 1 ) + IF (uspp) CALL MYZAXPY( kdim, kvc, sphi_d (1,ibnd,kdiis), 1, spsi (1,ibnd), 1 ) !$acc end host_data ! ! ... Residual vectors ! ec = CMPLX( php(ibnd,kdiis), 0._DP, kind=DP ) ! - CALL ZCOPY_gpu( kdim, hphi_d(1,ibnd,kdiis), 1, vec1_d(1), 1 ) + CALL MYZCOPY( kdim, hphi_d(1,ibnd,kdiis), 1, vec1_d(1), 1 ) ! IF ( uspp ) THEN ! - CALL ZAXPY_gpu( kdim, -ec, sphi_d (1,ibnd,kdiis), 1, vec1_d (1), 1 ) + CALL MYZAXPY( kdim, -ec, sphi_d (1,ibnd,kdiis), 1, vec1_d (1), 1 ) ! ELSE ! - CALL ZAXPY_gpu( kdim, -ec, phi_d (1,ibnd,kdiis), 1, vec1_d (1), 1 ) + CALL MYZAXPY( kdim, -ec, phi_d (1,ibnd,kdiis), 1, vec1_d (1), 1 ) ! END IF ! !$acc host_data use_device(kpsi) - CALL ZAXPY_gpu( kdim, kvc, vec1_d (1), 1, kpsi (1,kbnd), 1 ) + CALL MYZAXPY( kdim, kvc, vec1_d (1), 1, kpsi (1,kbnd), 1 ) !$acc end host_data ! END DO @@ -583,10 +583,10 @@ CONTAINS ! norm = SQRT( sw(ibnd) ) !$acc host_data use_device(psi, hpsi, spsi) - CALL ZDSCAL_gpu( kdim, 1._DP / norm, psi (1,ibnd), 1 ) - CALL ZDSCAL_gpu( kdim, 1._DP / norm, hpsi(1,ibnd), 1 ) + CALL MYZDSCAL( kdim, 1._DP / norm, psi (1,ibnd), 1 ) + CALL MYZDSCAL( kdim, 1._DP / norm, hpsi(1,ibnd), 1 ) IF ( uspp ) & - CALL ZDSCAL_gpu( kdim, 1._DP / norm, spsi(1,ibnd), 1 ) + CALL MYZDSCAL( kdim, 1._DP / norm, spsi(1,ibnd), 1 ) !$acc end host_data ! ! ... Residual vectors @@ -594,15 +594,15 @@ CONTAINS ec = CMPLX( hw(ibnd), 0._DP, kind=DP ) ! !$acc host_data use_device(psi, spsi, hpsi, kpsi) - CALL ZCOPY_gpu( kdim, hpsi(1,ibnd), 1, kpsi(1,kbnd), 1 ) + CALL MYZCOPY( kdim, hpsi(1,ibnd), 1, kpsi(1,kbnd), 1 ) ! IF ( uspp ) THEN ! - CALL ZAXPY_gpu( kdim, -ec, spsi(1,ibnd), 1, kpsi(1,kbnd), 1 ) + CALL MYZAXPY( kdim, -ec, spsi(1,ibnd), 1, kpsi(1,kbnd), 1 ) ! ELSE ! - CALL ZAXPY_gpu( kdim, -ec, psi(1,ibnd), 1, kpsi(1,kbnd), 1 ) + CALL MYZAXPY( kdim, -ec, psi(1,ibnd), 1, kpsi(1,kbnd), 1 ) ! END IF !$acc end host_data @@ -766,7 +766,7 @@ CONTAINS COMPLEX(DP) :: z1, z2 REAL(DP), ALLOCATABLE :: coef(:,:) ! - COMPLEX(DP), EXTERNAL :: ZDOTC_gpu + COMPLEX(DP), EXTERNAL :: MYZDOTC ! REAL(DP) :: ekinj INTEGER :: idx @@ -901,21 +901,21 @@ CONTAINS kbnd = ibnd_index(ibnd) ! !$acc host_data use_device(psi, hpsi, spsi, kpsi, hkpsi, skpsi) - php = DBLE( ZDOTC_gpu( kdim, psi (1,ibnd), 1, hpsi (1,ibnd), 1 ) ) - khp = DBLE( ZDOTC_gpu( kdim, kpsi(1,kbnd), 1, hpsi (1,ibnd), 1 ) ) - khk = DBLE( ZDOTC_gpu( kdim, kpsi(1,kbnd), 1, hkpsi(1,kbnd), 1 ) ) + php = DBLE( MYZDOTC( kdim, psi (1,ibnd), 1, hpsi (1,ibnd), 1 ) ) + khp = DBLE( MYZDOTC( kdim, kpsi(1,kbnd), 1, hpsi (1,ibnd), 1 ) ) + khk = DBLE( MYZDOTC( kdim, kpsi(1,kbnd), 1, hkpsi(1,kbnd), 1 ) ) ! IF ( uspp ) THEN ! - psp = DBLE( ZDOTC_gpu( kdim, psi (1,ibnd), 1, spsi (1,ibnd), 1 ) ) - ksp = DBLE( ZDOTC_gpu( kdim, kpsi(1,kbnd), 1, spsi (1,ibnd), 1 ) ) - ksk = DBLE( ZDOTC_gpu( kdim, kpsi(1,kbnd), 1, skpsi(1,kbnd), 1 ) ) + psp = DBLE( MYZDOTC( kdim, psi (1,ibnd), 1, spsi (1,ibnd), 1 ) ) + ksp = DBLE( MYZDOTC( kdim, kpsi(1,kbnd), 1, spsi (1,ibnd), 1 ) ) + ksk = DBLE( MYZDOTC( kdim, kpsi(1,kbnd), 1, skpsi(1,kbnd), 1 ) ) ! ELSE ! - psp = DBLE( ZDOTC_gpu( kdim, psi (1,ibnd), 1, psi (1,ibnd), 1 ) ) - ksp = DBLE( ZDOTC_gpu( kdim, kpsi(1,kbnd), 1, psi (1,ibnd), 1 ) ) - ksk = DBLE( ZDOTC_gpu( kdim, kpsi(1,kbnd), 1, kpsi(1,kbnd), 1 ) ) + psp = DBLE( MYZDOTC( kdim, psi (1,ibnd), 1, psi (1,ibnd), 1 ) ) + ksp = DBLE( MYZDOTC( kdim, kpsi(1,kbnd), 1, psi (1,ibnd), 1 ) ) + ksk = DBLE( MYZDOTC( kdim, kpsi(1,kbnd), 1, kpsi(1,kbnd), 1 ) ) ! END IF !$acc end host_data @@ -1029,16 +1029,16 @@ CONTAINS z2 = CMPLX( coef(2,jbnd), 0._DP, kind=DP ) ! !$acc host_data use_device(psi, hpsi, spsi, kpsi, hkpsi, skpsi) - CALL ZSCAL_gpu( kdim, z1, psi (1,ibnd), 1 ) - CALL ZAXPY_gpu( kdim, z2, kpsi(1,kbnd), 1, psi(1,ibnd), 1 ) + CALL MYZSCAL( kdim, z1, psi (1,ibnd), 1 ) + CALL MYZAXPY( kdim, z2, kpsi(1,kbnd), 1, psi(1,ibnd), 1 ) ! - CALL ZSCAL_gpu( kdim, z1, hpsi (1,ibnd), 1 ) - CALL ZAXPY_gpu( kdim, z2, hkpsi(1,kbnd), 1, hpsi(1,ibnd), 1 ) + CALL MYZSCAL( kdim, z1, hpsi (1,ibnd), 1 ) + CALL MYZAXPY( kdim, z2, hkpsi(1,kbnd), 1, hpsi(1,ibnd), 1 ) ! IF ( uspp ) THEN ! - CALL ZSCAL_gpu( kdim, z1, spsi (1,ibnd), 1 ) - CALL ZAXPY_gpu( kdim, z2, skpsi(1,kbnd), 1, spsi(1,ibnd), 1 ) + CALL MYZSCAL( kdim, z1, spsi (1,ibnd), 1 ) + CALL MYZAXPY( kdim, z2, skpsi(1,kbnd), 1, spsi(1,ibnd), 1 ) ! END IF !$acc end host_data diff --git a/KS_Solvers/RMM/rrmmdiagg_gpu.f90 b/KS_Solvers/RMM/rrmmdiagg_gpu.f90 index 81b67f928..4d4249894 100644 --- a/KS_Solvers/RMM/rrmmdiagg_gpu.f90 +++ b/KS_Solvers/RMM/rrmmdiagg_gpu.f90 @@ -297,7 +297,7 @@ CONTAINS ! INTEGER :: ibnd ! - REAL(DP), EXTERNAL :: gpu_DDOT + REAL(DP), EXTERNAL :: MYDDOT ! ! ... Operate the Hamiltonian : H |psi> ! @@ -324,10 +324,10 @@ CONTAINS !$acc host_data use_device(psi, hpsi) DO ibnd = ibnd_start, ibnd_end ! - hw(ibnd) = 2._DP * gpu_DDOT( npw2, psi(1,ibnd), 1, hpsi(1,ibnd), 1 ) + hw(ibnd) = 2._DP * MYDDOT( npw2, psi(1,ibnd), 1, hpsi(1,ibnd), 1 ) ! IF ( gstart == 2 ) & - hw(ibnd)= hw(ibnd) - gpu_DDOT( 1, psi(1,ibnd), 1, hpsi(1,ibnd),1) + hw(ibnd)= hw(ibnd) - MYDDOT( 1, psi(1,ibnd), 1, hpsi(1,ibnd),1) ! END DO !$acc end host_data @@ -341,17 +341,17 @@ CONTAINS ! IF ( uspp ) THEN ! - sw(ibnd) = 2._DP * gpu_DDOT( npw2, psi(1,ibnd), 1, spsi(1,ibnd), 1 ) + sw(ibnd) = 2._DP * MYDDOT( npw2, psi(1,ibnd), 1, spsi(1,ibnd), 1 ) ! IF ( gstart == 2 ) & - sw(ibnd)= sw(ibnd) - gpu_DDOT( 1, psi(1,ibnd), 1, spsi(1,ibnd),1) + sw(ibnd)= sw(ibnd) - MYDDOT( 1, psi(1,ibnd), 1, spsi(1,ibnd),1) ! ELSE ! - sw(ibnd) = 2._DP * gpu_DDOT( npw2, psi(1,ibnd), 1, psi(1,ibnd), 1 ) + sw(ibnd) = 2._DP * MYDDOT( npw2, psi(1,ibnd), 1, psi(1,ibnd), 1 ) ! IF ( gstart == 2 ) & - sw(ibnd )= sw(ibnd) - gpu_DDOT( 1, psi(1,ibnd), 1, psi(1,ibnd),1) + sw(ibnd )= sw(ibnd) - MYDDOT( 1, psi(1,ibnd), 1, psi(1,ibnd),1) ! END IF ! @@ -413,10 +413,10 @@ CONTAINS IF ( conv(ibnd) ) CYCLE ! !$acc host_data use_device(psi, hpsi, spsi) - CALL DCOPY_gpu( npw2, psi (1,ibnd), 1, phi_d (1,ibnd,idiis), 1 ) - CALL DCOPY_gpu( npw2, hpsi(1,ibnd), 1, hphi_d(1,ibnd,idiis), 1 ) + CALL MYDCOPY( npw2, psi (1,ibnd), 1, phi_d (1,ibnd,idiis), 1 ) + CALL MYDCOPY( npw2, hpsi(1,ibnd), 1, hphi_d(1,ibnd,idiis), 1 ) IF ( uspp ) & - CALL DCOPY_gpu( npw2, spsi(1,ibnd), 1, sphi_d(1,ibnd,idiis), 1 ) + CALL MYDCOPY( npw2, spsi(1,ibnd), 1, sphi_d(1,ibnd,idiis), 1 ) !$acc end host_data ! php(ibnd,idiis) = hw(ibnd) @@ -438,15 +438,15 @@ CONTAINS ! er = php(ibnd,kdiis) ! - CALL DCOPY_gpu( npw2, hphi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 ) + CALL MYDCOPY( npw2, hphi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 ) ! IF ( uspp ) THEN ! - CALL DAXPY_gpu( npw2, -er, sphi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 ) + CALL MYDAXPY( npw2, -er, sphi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 ) ! ELSE ! - CALL DAXPY_gpu( npw2, -er, phi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 ) + CALL MYDAXPY( npw2, -er, phi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 ) ! END IF ! @@ -454,19 +454,19 @@ CONTAINS ! er = php(ibnd,idiis) ! - CALL DCOPY_gpu( npw2, hphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 ) + CALL MYDCOPY( npw2, hphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 ) ! IF ( uspp ) THEN ! - CALL DAXPY_gpu( npw2, -er, sphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 ) + CALL MYDAXPY( npw2, -er, sphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 ) ! ELSE ! - CALL DAXPY_gpu( npw2, -er, phi_d(1,ibnd,idiis), 1, vec1_d(1), 1 ) + CALL MYDAXPY( npw2, -er, phi_d(1,ibnd,idiis), 1, vec1_d(1), 1 ) ! END IF ! - CALL DGEMV_gpu( 'T', npw2, idiis, 2._DP, vec2_d(1,1), npwx2, & + CALL MYDGEMV( 'T', npw2, idiis, 2._DP, vec2_d(1,1), npwx2, & vec1_d(1), 1, 0._DP, tr_d(1,jbnd), 1 ) ! IF ( gstart == 2 ) THEN @@ -507,21 +507,21 @@ CONTAINS ! DO kdiis = 1, idiis ! - CALL DCOPY_gpu( npw2, phi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 ) + CALL MYDCOPY( npw2, phi_d(1,ibnd,kdiis), 1, vec2_d(1,kdiis), 1 ) ! END DO ! IF ( uspp ) THEN ! - CALL DCOPY_gpu( npw2, sphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 ) + CALL MYDCOPY( npw2, sphi_d(1,ibnd,idiis), 1, vec1_d(1), 1 ) ! ELSE ! - CALL DCOPY_gpu( npw2, phi_d(1,ibnd,idiis), 1, vec1_d(1), 1 ) + CALL MYDCOPY( npw2, phi_d(1,ibnd,idiis), 1, vec1_d(1), 1 ) ! END IF ! - CALL DGEMV_gpu( 'T', npw2, idiis, 2._DP, vec2_d(1,1), npwx2, & + CALL MYDGEMV( 'T', npw2, idiis, 2._DP, vec2_d(1,1), npwx2, & vec1_d(1), 1, 0._DP, tr_d(1,jbnd), 1 ) ! IF ( gstart == 2 ) THEN @@ -580,30 +580,30 @@ CONTAINS ! kvr = vr(kdiis) !$acc host_data use_device(psi, hpsi, spsi) - CALL DAXPY_gpu( npw2, kvr, phi_d (1,ibnd,kdiis), 1, psi (1,ibnd), 1 ) - CALL DAXPY_gpu( npw2, kvr, hphi_d(1,ibnd,kdiis), 1, hpsi(1,ibnd), 1 ) + CALL MYDAXPY( npw2, kvr, phi_d (1,ibnd,kdiis), 1, psi (1,ibnd), 1 ) + CALL MYDAXPY( npw2, kvr, hphi_d(1,ibnd,kdiis), 1, hpsi(1,ibnd), 1 ) IF ( uspp ) & - CALL DAXPY_gpu( npw2, kvr, sphi_d(1,ibnd,kdiis), 1, spsi(1,ibnd), 1 ) + CALL MYDAXPY( npw2, kvr, sphi_d(1,ibnd,kdiis), 1, spsi(1,ibnd), 1 ) !$acc end host_data ! ! ... Residual vectors ! er = php(ibnd,kdiis) ! - CALL DCOPY_gpu( npw2, hphi_d(1,ibnd,kdiis), 1, vec1_d(1), 1 ) + CALL MYDCOPY( npw2, hphi_d(1,ibnd,kdiis), 1, vec1_d(1), 1 ) ! IF ( uspp ) THEN ! - CALL DAXPY_gpu( npw2, -er, sphi_d(1,ibnd,kdiis), 1, vec1_d(1), 1 ) + CALL MYDAXPY( npw2, -er, sphi_d(1,ibnd,kdiis), 1, vec1_d(1), 1 ) ! ELSE ! - CALL DAXPY_gpu( npw2, -er, phi_d(1,ibnd,kdiis), 1, vec1_d(1), 1 ) + CALL MYDAXPY( npw2, -er, phi_d(1,ibnd,kdiis), 1, vec1_d(1), 1 ) ! END IF ! !$acc host_data use_device(kpsi) - CALL DAXPY_gpu( npw2, kvr, vec1_d(1), 1, kpsi(1,kbnd), 1 ) + CALL MYDAXPY( npw2, kvr, vec1_d(1), 1, kpsi(1,kbnd), 1 ) !$acc end host_data ! END DO @@ -614,10 +614,10 @@ CONTAINS ! norm = SQRT( sw(ibnd) ) !$acc host_data use_device(psi, hpsi, spsi) - CALL DSCAL_gpu( npw2, 1._DP / norm, psi (1,ibnd), 1 ) - CALL DSCAL_gpu( npw2, 1._DP / norm, hpsi(1,ibnd), 1 ) + CALL MYDSCAL( npw2, 1._DP / norm, psi (1,ibnd), 1 ) + CALL MYDSCAL( npw2, 1._DP / norm, hpsi(1,ibnd), 1 ) IF ( uspp ) & - CALL DSCAL_gpu( npw2, 1._DP / norm, spsi(1,ibnd), 1 ) + CALL MYDSCAL( npw2, 1._DP / norm, spsi(1,ibnd), 1 ) !$acc end host_data ! ! ... Residual vectors @@ -625,21 +625,21 @@ CONTAINS er = hw(ibnd) ! !$acc host_data use_device(hpsi, kpsi) - CALL DCOPY_gpu( npw2, hpsi(1,ibnd), 1, kpsi(1,kbnd), 1 ) + CALL MYDCOPY( npw2, hpsi(1,ibnd), 1, kpsi(1,kbnd), 1 ) !$acc end host_data ! IF ( uspp ) THEN ! !$acc host_data use_device(spsi, kpsi) - CALL DAXPY_gpu( npw2, -er, spsi(1,ibnd), 1, kpsi(1,kbnd), 1 ) + CALL MYDAXPY( npw2, -er, spsi(1,ibnd), 1, kpsi(1,kbnd), 1 ) !$acc end host_data ! ELSE ! !civn: here changed spsi --> psi due to a possible typo in the original version - !CALL DAXPY_gpu( npw2, -er, spsi_d(1,ibnd), 1, kpsi_d(1,kbnd), 1 ) + !CALL MYDAXPY( npw2, -er, spsi_d(1,ibnd), 1, kpsi_d(1,kbnd), 1 ) !$acc host_data use_device(psi, kpsi) - CALL DAXPY_gpu( npw2, -er, psi(1,ibnd), 1, kpsi(1,kbnd), 1 ) + CALL MYDAXPY( npw2, -er, psi(1,ibnd), 1, kpsi(1,kbnd), 1 ) !$acc end host_data ! END IF @@ -819,7 +819,7 @@ CONTAINS REAL(DP) :: c1, c2 REAL(DP), ALLOCATABLE :: coef(:,:) ! - REAL(DP), EXTERNAL :: gpu_DDOT + REAL(DP), EXTERNAL :: MYDDOT !civn INTEGER :: idx REAL(DP) :: ekinj @@ -989,9 +989,9 @@ CONTAINS kbnd = ibnd_index(ibnd) ! !$acc host_data use_device(psi, hpsi, kpsi, hkpsi) - php = 2._DP * gpu_DDOT( npw2, psi (1,ibnd), 1, hpsi (1,ibnd), 1 ) - khp = 2._DP * gpu_DDOT( npw2, kpsi(1,kbnd), 1, hpsi (1,ibnd), 1 ) - khk = 2._DP * gpu_DDOT( npw2, kpsi(1,kbnd), 1, hkpsi(1,kbnd), 1 ) + php = 2._DP * MYDDOT( npw2, psi (1,ibnd), 1, hpsi (1,ibnd), 1 ) + khp = 2._DP * MYDDOT( npw2, kpsi(1,kbnd), 1, hpsi (1,ibnd), 1 ) + khk = 2._DP * MYDDOT( npw2, kpsi(1,kbnd), 1, hkpsi(1,kbnd), 1 ) !$acc end host_data ! IF ( gstart == 2 ) THEN @@ -1020,9 +1020,9 @@ CONTAINS IF ( uspp ) THEN ! !$acc host_data use_device(psi, spsi, kpsi, skpsi) - psp = 2._DP * gpu_DDOT( npw2, psi (1,ibnd), 1, spsi (1,ibnd), 1 ) - ksp = 2._DP * gpu_DDOT( npw2, kpsi(1,kbnd), 1, spsi (1,ibnd), 1 ) - ksk = 2._DP * gpu_DDOT( npw2, kpsi(1,kbnd), 1, skpsi(1,kbnd), 1 ) + psp = 2._DP * MYDDOT( npw2, psi (1,ibnd), 1, spsi (1,ibnd), 1 ) + ksp = 2._DP * MYDDOT( npw2, kpsi(1,kbnd), 1, spsi (1,ibnd), 1 ) + ksk = 2._DP * MYDDOT( npw2, kpsi(1,kbnd), 1, skpsi(1,kbnd), 1 ) !$acc end host_data ! IF ( gstart == 2 ) THEN @@ -1051,9 +1051,9 @@ CONTAINS ELSE ! !$acc host_data use_device(psi, kpsi) - psp = 2._DP * gpu_DDOT( npw2, psi (1,ibnd), 1, psi (1,ibnd), 1 ) - ksp = 2._DP * gpu_DDOT( npw2, kpsi(1,kbnd), 1, psi (1,ibnd), 1 ) - ksk = 2._DP * gpu_DDOT( npw2, kpsi(1,kbnd), 1, kpsi(1,kbnd), 1 ) + psp = 2._DP * MYDDOT( npw2, psi (1,ibnd), 1, psi (1,ibnd), 1 ) + ksp = 2._DP * MYDDOT( npw2, kpsi(1,kbnd), 1, psi (1,ibnd), 1 ) + ksk = 2._DP * MYDDOT( npw2, kpsi(1,kbnd), 1, kpsi(1,kbnd), 1 ) !$acc end host_data ! IF ( gstart == 2 ) THEN @@ -1190,16 +1190,16 @@ CONTAINS c2 = coef(2,jbnd) ! !$acc host_data use_device(psi, hpsi, spsi, kpsi, hkpsi, skpsi) - CALL DSCAL_gpu( npw2, c1, psi (1,ibnd), 1 ) - CALL DAXPY_gpu( npw2, c2, kpsi(1,kbnd), 1, psi(1,ibnd), 1 ) + CALL MYDSCAL( npw2, c1, psi (1,ibnd), 1 ) + CALL MYDAXPY( npw2, c2, kpsi(1,kbnd), 1, psi(1,ibnd), 1 ) ! - CALL DSCAL_gpu( npw2, c1, hpsi (1,ibnd), 1 ) - CALL DAXPY_gpu( npw2, c2, hkpsi(1,kbnd), 1, hpsi(1,ibnd), 1 ) + CALL MYDSCAL( npw2, c1, hpsi (1,ibnd), 1 ) + CALL MYDAXPY( npw2, c2, hkpsi(1,kbnd), 1, hpsi(1,ibnd), 1 ) ! IF ( uspp ) THEN ! - CALL DSCAL_gpu( npw2, c1, spsi (1,ibnd), 1 ) - CALL DAXPY_gpu( npw2, c2, skpsi(1,kbnd), 1, spsi(1,ibnd), 1 ) + CALL MYDSCAL( npw2, c1, spsi (1,ibnd), 1 ) + CALL MYDAXPY( npw2, c2, skpsi(1,kbnd), 1, spsi(1,ibnd), 1 ) ! END IF !$acc end host_data diff --git a/UtilXlib/device_helper.f90 b/UtilXlib/device_helper.f90 index 8c2a6956f..fc84c1e41 100644 --- a/UtilXlib/device_helper.f90 +++ b/UtilXlib/device_helper.f90 @@ -222,3 +222,168 @@ USE cublas return end subroutine MYDDOTv3 +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +subroutine MYDTRSM(side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb) +#if defined(__CUDA) +USE cublas +#endif +implicit none + character*1 :: side, uplo, transa, diag + integer :: m, n, lda, ldb + DOUBLE PRECISION :: alpha + DOUBLE PRECISION, dimension(lda, *) :: a + DOUBLE PRECISION, dimension(ldb, *) :: b +#if defined(__CUDA) + attributes(device) :: a, b + call cublasDTRSM(side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb) +#endif + return +end subroutine MYDTRSM +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +DOUBLE COMPLEX function MYZDOTC(n, zx, incx, zy, incy) +#if defined(__CUDA) +USE cublas +#endif +implicit none + integer :: n, incx, incy + DOUBLE COMPLEX, dimension(*) :: zx, zy +#if defined(__CUDA) + attributes(device) :: zx, zy + MYZDOTC = cublasZDOTC(n, zx, incx, zy, incy) +#endif + return +end function MYZDOTC +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +SUBROUTINE MYZSWAP(n, zx, incx, zy, incy) +#if defined(__CUDA) +USE cublas +#endif +implicit none + integer :: n, incx, incy + DOUBLE COMPLEX, dimension(*) :: zx, zy +#if defined(__CUDA) + attributes(device) :: zx, zy + CALL cublasZSWAP(n, zx, incx, zy, incy) +#endif + return +END SUBROUTINE MYZSWAP +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +SUBROUTINE MYZCOPY(n, zx, incx, zy, incy) +#if defined(__CUDA) +USE cublas +#endif +IMPLICIT NONE + INTEGER :: n, incx, incy + DOUBLE COMPLEX, dimension(*) :: zx, zy +#if defined(__CUDA) + attributes(device) :: zx, zy + CALL cublasZCOPY(n, zx, incx, zy, incy) +#endif + RETURN +END SUBROUTINE MYZCOPY +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +SUBROUTINE MYZAXPY(n, za, zx, incx, zy, incy) +#if defined(__CUDA) +USE cublas +#endif +IMPLICIT NONE + INTEGER :: n, incx, incy + DOUBLE COMPLEX :: za + DOUBLE COMPLEX, dimension(*) :: zx, zy +#if defined(__CUDA) + attributes(device) :: zx, zy + CALL cublasZAXPY(n, za, zx, incx, zy, incy) +#endif + RETURN +END SUBROUTINE MYZAXPY +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +SUBROUTINE MYZDSCAL(n, da, zx, incx) +#if defined(__CUDA) +USE cublas +#endif +IMPLICIT NONE + INTEGER :: n, incx + DOUBLE PRECISION :: da + DOUBLE COMPLEX, dimension(*) :: zx +#if defined(__CUDA) + attributes(device) :: zx + CALL cublasZDSCAL(n, da, zx, incx) +#endif + RETURN +END SUBROUTINE MYZDSCAL +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +SUBROUTINE MYZSCAL(n, za, zx, incx) +#if defined(__CUDA) +USE cublas +#endif +IMPLICIT NONE + INTEGER :: n, incx + DOUBLE COMPLEX :: za + DOUBLE COMPLEX, dimension(*) :: zx +#if defined(__CUDA) + attributes(device) :: zx + CALL cublasZSCAL(n, za, zx, incx) +#endif + RETURN +END SUBROUTINE MYZSCAL +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +SUBROUTINE MYDCOPY(n, x, incx, y, incy) +#if defined(__CUDA) +USE cublas +#endif +IMPLICIT NONE + INTEGER :: n, incx, incy + DOUBLE PRECISION, INTENT(IN) :: x(*) + DOUBLE PRECISION, INTENT(OUT) :: y(*) +#if defined(__CUDA) + attributes(device) :: x, y + call cublasDCOPY(n, x, incx, y, incy) +#endif + RETURN +END SUBROUTINE MYDCOPY +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +SUBROUTINE MYDAXPY(n, a, x, incx, y, incy) +#if defined(__CUDA) +USE cublas +#endif +IMPLICIT NONE + INTEGER :: n, incx, incy + DOUBLE PRECISION, INTENT(IN) :: a + DOUBLE PRECISION, INTENT(IN) :: x(*) + DOUBLE PRECISION, INTENT(OUT) :: y(*) +#if defined(__CUDA) + attributes(device) :: x, y + call cublasDAXPY( n, a, x, incx, y, incy) +#endif + RETURN +END SUBROUTINE MYDAXPY +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +SUBROUTINE MYDSCAL(n, a, x, incx) +#if defined(__CUDA) +USE cublas +#endif +IMPLICIT NONE + integer :: n, incx + DOUBLE PRECISION :: a + DOUBLE PRECISION, dimension(*) :: x +#if defined(__CUDA) + attributes(device) :: x + call cublasDSCAL(n, a, x, incx) +#endif + RETURN +END SUBROUTINE MYDSCAL +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +SUBROUTINE MYDSWAP(n, dx, incx, dy, incy) +#if defined(__CUDA) +USE cublas +#endif +implicit none + integer :: n, incx, incy + REAL(8), dimension(*) :: dx, dy +#if defined(__CUDA) + attributes(device) :: dx, dy + CALL cublasDSWAP(n, dx, incx, dy, incy) +#endif + return +END SUBROUTINE MYDSWAP +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! From d10d24c3a18bccf145df4d117faffb0d667ac713 Mon Sep 17 00:00:00 2001 From: Paolo Giannozzi Date: Wed, 21 Aug 2024 09:24:51 +0200 Subject: [PATCH 6/7] Useless BLAS interfaces removed --- Modules/CMakeLists.txt | 3 +-- Modules/Makefile | 1 - Modules/cuda_subroutines.f90 | 39 ------------------------------------ PW/src/newd_gpu.f90 | 4 ---- 4 files changed, 1 insertion(+), 46 deletions(-) delete mode 100644 Modules/cuda_subroutines.f90 diff --git a/Modules/CMakeLists.txt b/Modules/CMakeLists.txt index 719ca4c96..781f569e7 100644 --- a/Modules/CMakeLists.txt +++ b/Modules/CMakeLists.txt @@ -202,8 +202,7 @@ set(src_modules w1gauss.f90 deviatoric.f90 # GPU - random_numbers_gpu.f90 - cuda_subroutines.f90) + random_numbers_gpu.f90) qe_enable_cuda_fortran("${src_modules}") qe_add_library(qe_modules ${src_modules}) diff --git a/Modules/Makefile b/Modules/Makefile index 6185d78d3..21706653c 100644 --- a/Modules/Makefile +++ b/Modules/Makefile @@ -221,7 +221,6 @@ sockets.o # GPU versions of modules MODULES += \ - cuda_subroutines.o \ random_numbers_gpu.o TLDEPS= libfox libla libfft libutil libmbd librxc libupf diff --git a/Modules/cuda_subroutines.f90 b/Modules/cuda_subroutines.f90 deleted file mode 100644 index 1055b263f..000000000 --- a/Modules/cuda_subroutines.f90 +++ /dev/null @@ -1,39 +0,0 @@ - !---------------------------------------------- - ! ... this file contains a number of subroutines optionally interfaced - ! ... to cublas - !---------------------------------------------- - -SUBROUTINE cudaDGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) -#if defined(__CUDA) - use cudafor - use cublas -#endif - implicit none - DOUBLE PRECISION :: ALPHA,BETA - INTEGER :: INCX,INCY,LDA,M,N - CHARACTER :: TRANS - DOUBLE PRECISION :: A(LDA,*),X(*),Y(*) -#if defined(__CUDA) - attributes(device) :: A, X, Y -#endif - ! - call DGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) - ! -END SUBROUTINE cudaDGEMV - -SUBROUTINE cudaDGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA ) -#if defined(__CUDA) - use cudafor - use cublas -#endif -! .. Scalar Arguments .. - DOUBLE PRECISION :: ALPHA - INTEGER :: INCX, INCY, LDA, M, N -! .. Array Arguments .. - DOUBLE PRECISION :: A( LDA, * ), X( * ), Y( * ) -#if defined(__CUDA) - attributes(device) :: A, X, Y -#endif - CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA ) - -END SUBROUTINE cudaDGER diff --git a/PW/src/newd_gpu.f90 b/PW/src/newd_gpu.f90 index b66d91841..a1ffce761 100644 --- a/PW/src/newd_gpu.f90 +++ b/PW/src/newd_gpu.f90 @@ -19,10 +19,6 @@ SUBROUTINE newq_gpu(vr,deeq_d,skip_vltot) #if defined(__CUDA) USE cudafor USE cublas -#else -#define cublasZgemm Zgemm -#define cublasDGEMM Dgemm -#define cudaDGER Dger #endif USE kinds, ONLY : DP USE ions_base, ONLY : nat, ntyp => nsp, ityp From b50321f43fb901b2f562c38d122bb9f0e4e8fcc1 Mon Sep 17 00:00:00 2001 From: Paolo Giannozzi Date: Wed, 21 Aug 2024 10:34:41 +0200 Subject: [PATCH 7/7] Yet another cuda_dger removed --- LR_Modules/ch_psi_all.f90 | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/LR_Modules/ch_psi_all.f90 b/LR_Modules/ch_psi_all.f90 index 5549bf252..09ea8f0eb 100644 --- a/LR_Modules/ch_psi_all.f90 +++ b/LR_Modules/ch_psi_all.f90 @@ -257,9 +257,6 @@ CONTAINS USE realus, ONLY : real_space, invfft_orbital_gamma, & fwfft_orbital_gamma, calbec_rs_gamma, s_psir_gamma use gvect, only : gstart -#if defined(__CUDA) - USE cublas -#endif IMPLICIT NONE INTEGER :: m_start, m_end ,ntemp @@ -280,15 +277,10 @@ CONTAINS CALL errore('ch_psi_all', 'non collin in gamma point not implemented',1) ENDIF -#if defined(__CUDA) !$acc host_data use_device(spsi, ps, evc) - CALL DGEMM( 'C', 'N', nbnd, m, 2*n, 2.D0,evc, 2*npwx*npol, spsi, 2*npwx*npol, 0.D0, ps, nbnd ) - if(gstart==2) CALL gpu_DGER(nbnd, m, -1.0_DP, evc, 2*npwx, spsi, 2*npwx, ps, nbnd ) + CALL MYDGEMM( 'C', 'N', nbnd, m, 2*n, 2.D0,evc, 2*npwx*npol, spsi, 2*npwx*npol, 0.D0, ps, nbnd ) + if(gstart==2) CALL MYDGER(nbnd, m, -1.0_DP, evc, 2*npwx, spsi, 2*npwx, ps, nbnd ) !$acc end host_data -#else - CALL DGEMM( 'C', 'N', nbnd, m, 2*n, 2.D0,evc, 2*npwx*npol, spsi, 2*npwx*npol, 0.D0, ps, nbnd ) - if(gstart==2) CALL DGER(nbnd, m, -1.0_DP, evc, 2*npwx, spsi, 2*npwx, ps, nbnd ) -#endif !$acc kernels ps (:,:) = ps(:,:) * alpha_pv hpsi (:,:) = (0.d0, 0.d0) @@ -297,7 +289,7 @@ CONTAINS CALL mp_sum ( ps, intra_bgrp_comm ) !$acc end host_data !$acc host_data use_device(hpsi, ps, evc) - CALL DGEMM ('N', 'N', 2*n, m, ntemp , 1.d0 , evc, 2*npwx, ps, nbnd, 1.d0 , hpsi, 2*npwx) + CALL MYDGEMM ('N', 'N', 2*n, m, ntemp , 1.d0 , evc, 2*npwx, ps, nbnd, 1.d0 , hpsi, 2*npwx) !$acc end host_data !$acc kernels spsi(:,:) = hpsi(:,:)