Solve merge conflicts

This commit is contained in:
Marc Torrent 2023-12-17 21:32:32 +01:00
parent 420ed8ab0f
commit bd2354497d
32 changed files with 139 additions and 180 deletions

View File

@ -366,6 +366,9 @@
/* Define to 1 if you want to activate support for OpenMP GPU offload. */
#cmakedefine HAVE_OPENMP_OFFLOAD @HAVE_OPENMP_OFFLOAD@
/* Define to 1 if you want to activate support for OpenMP GPU offload of data structures. */
#cmakedefine HAVE_OPENMP_OFFLOAD_DATASTRUCTURE @HAVE_OPENMP_OFFLOAD_DATASTRUCTURE@
/* Set to 1 if OpenMP has a working implementation of COLLAPSE. */
#cmakedefine HAVE_OMP_COLLAPSE @HAVE_OMP_COLLAPSE@

View File

@ -485,6 +485,8 @@ inline const char* _ConvertSMVer2ArchName(int major, int minor) {
{0x75, "Turing"},
{0x80, "Ampere"},
{0x86, "Ampere"},
{0x89, "AdaLovelace"},
{0x90, "Hopper"},
{-1, "Graphics Device"}};
int index = 0;

View File

@ -499,6 +499,8 @@ inline const char* _ConvertSMVer2ArchName(int major, int minor) {
{0x75, "Turing"},
{0x80, "Ampere"},
{0x86, "Ampere"},
{0x89, "AdaLovelace"},
{0x90, "Hopper"},
{-1, "Graphics Device"}};
int index = 0;

View File

@ -418,39 +418,24 @@ end subroutine Get_Mem_Dev
#if defined HAVE_GPU
! Closing YAKL and Kokkos if opened
if (gpu_option==ABI_GPU_KOKKOS .or. gpu_option==ABI_GPU_LEGACY) then
if (gpu_option==ABI_GPU_KOKKOS) then
#ifdef HAVE_YAKL
call gator_finalize()
write(std_out,*)'yakl gator finalized'
#endif
#ifdef HAVE_KOKKOS
! finalize kokkos
call kokkos_finalize()
write(std_out,*)'kokkos finalized'
#endif
!kokkos_finalize already reset GPU context
!if (gpu_option/=ABI_GPU_KOKKOS) call unset_dev()
end if
! kokkos_finalize already reset GPU context
!if (gpu_option/=ABI_GPU_KOKKOS) call unset_dev()
! Closing YAKL and Kokkos if opened
if (gpu_option==ABI_GPU_KOKKOS .or. gpu_option==ABI_GPU_LEGACY) then
#ifdef HAVE_YAKL
call gator_finalize()
write(std_out,*)'yakl gator finalized'
#endif
#ifdef HAVE_KOKKOS
! finalize kokkos
call kokkos_finalize()
write(std_out,*)'kokkos finalized'
#endif
if (gpu_option==ABI_GPU_LEGACY) then
call unset_dev()
end if
! kokkos_finalize already reset GPU context
!if (gpu_option/=ABI_GPU_KOKKOS) call unset_dev()
#endif
end subroutine unsetdevice_cuda
!!***

View File

@ -583,7 +583,7 @@ subroutine nctk_test_mpiio(print_warning)
!FIXME nf90create fails when using NVHPC
! This might be due to my environment, maybe not, need to investigate this...
#ifndef FC_NVHPC
!!#ifndef FC_NVHPC
#ifdef HAVE_NETCDF_MPI
if (xmpi_comm_rank(xmpi_world) == master) then
! Try to open a file with hdf5.
@ -619,7 +619,7 @@ subroutine nctk_test_mpiio(print_warning)
ABI_WARNING(msg)
end if
#endif
#endif
!!#endif
#ifdef HAVE_NETCDF_DEFAULT
if (.not. nctk_has_mpiio) then

View File

@ -5002,7 +5002,7 @@ end subroutine abi_gpu_xcopy_2z
!! b
!!
!! SIDE EFFECTS
!! WARNING! : this routine is a dummy one when HAVE_GPU_CUDA is not enabled
!! WARNING! : this routine is a dummy one when HAVE_GPU is not enabled
!! the correct one is in 17_toolbox/gpu_linalg.cu
!!
!! SOURCE

View File

@ -204,11 +204,6 @@ end subroutine elpa_func_uninit
!! INPUTS
!! [blacs_ctx]= -- optional -- Blacs context
!! [gpu]= -- optional -- Flag (0 or 1): use GPU version (currently only NVidia)
!! na=Order of matrix A
!! nblk=Blocksize of cyclic distribution, must be the same in both directions!
!! local_nrows=Leading dimension of A
!! local_ncols=Local columns of matrices A and Q (eigenvectors)
!! nev=Number of eigenvalues needed.
!!
!! SIDE EFFECTS
!! elpa_hdl(type<elpa_hdl_t>)= ELPA handle
@ -223,6 +218,7 @@ subroutine elpa_func_allocate(elpa_hdl,gpu,blacs_ctx)
!Local variables-------------------------------
integer :: err,l_gpu,l_blacs_ctx
logical :: gpu_debug_mode=.false.
character(len=10) :: varname
! *********************************************************************
@ -261,49 +257,30 @@ subroutine elpa_func_allocate(elpa_hdl,gpu,blacs_ctx)
ABI_ERROR("You seem to use an old version of ELPA ( < 2021.x ) which only supports NVIDIA GPUs.")
#endif
end if
call elpa_func_error_handler(err_code=err,err_msg='Error when enabling GPU on ELPA')
if (err==ELPA_OK) call elpa_hdl%elpa%set("debug",1,err)
call elpa_func_error_handler(err_code=err,err_msg='Error when enabling debug on ELPA')
end if
#else
if (err==0.and.l_gpu==1) elpa_hdl%gpu=l_gpu
#endif
call elpa_func_error_handler(err_code=err,err_varname=varname)
call elpa_func_error_handler(err_code=err,err_msg='Error when enabling GPU on ELPA')
!if (err==ELPA_OK) call elpa_hdl%elpa%set("debug",1,err)
!call elpa_func_error_handler(err_code=err,err_msg='Error when enabling debug on ELPA')
if (gpu_debug_mode) then
if (err==ELPA_OK) call elpa_hdl%elpa%set("debug",1,err)
call elpa_func_error_handler(err_code=err,err_msg='Error when enabling debug on ELPA')
end if
end if
#else
if (err==0.and.l_gpu==1) elpa_hdl%gpu=l_gpu
if (err==0.and.l_gpu==1) then
elpa_hdl%gpu=l_gpu
if (gpu_debug_mode) elpa_hdl%debug=1
end if
#endif
call elpa_func_error_handler(err_code=err,err_varname=varname)
if (present(blacs_ctx)) then
if (err==ELPA_OK) call elpa_hdl%elpa%set("blacs_context",int(blacs_ctx,kind=c_int),err)
call elpa_func_error_handler(err_code=err,err_varname=varname)
end if
elpa_hdl%is_allocated=.true.
! Setting matrix size
call elpa_func_set_matrix(elpa_hdl,na,nblk,local_nrows,local_ncols,nev)
if (present(blacs_ctx)) then
if (err==ELPA_OK) call elpa_hdl%elpa%set("blacs_context",int(blacs_ctx,kind=c_int),err)
end if
! Proper ELPA setup
err = elpa_hdl%elpa%setup()
call elpa_func_error_handler(err_code=err,err_msg='Error during ELPA setup')
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
call nvtxEndRange()
#endif
end subroutine elpa_func_allocate
!!***
@ -453,6 +430,12 @@ subroutine elpa_func_get_communicators(elpa_hdl,mpi_comm_parent,process_row,proc
varname='process_col'
call elpa_hdl%elpa%set(trim(varname),process_col,err)
end if
if (err==ELPA_OK) then
varname=''
err = elpa_hdl%elpa%setup()
call elpa_func_error_handler(err_code=err,err_msg='Error during ELPA setup')
endif
#else
elpa_hdl%mpi_comm_parent=mpi_comm_parent
elpa_hdl%process_row=process_row
@ -467,10 +450,13 @@ subroutine elpa_func_get_communicators(elpa_hdl,mpi_comm_parent,process_row,proc
!ELPA-LEGACY-2017
err=elpa_get_communicators(mpi_comm_parent,process_row,process_col,elpa_hdl%elpa_comm_rows,elpa_hdl%elpa_comm_cols)
#endif
#endif
call elpa_func_error_handler(err_code=err,err_msg='Error in elpa_get_communicators',err_varname=varname)
elpa_hdl%is_allocated=.true.
end subroutine elpa_func_get_communicators
!!***

View File

@ -2687,7 +2687,7 @@ subroutine compute_eigen_problem(processor, matrix, results, eigen, comm, istwf_
call elpa_func_allocate(elpa_hdl,gpu=use_gpu_elpa_)
call elpa_func_set_matrix(elpa_hdl,matrix%sizeb_global(1),matrix%sizeb_blocs(1),nev__,&
& matrix%sizeb_local(1),matrix%sizeb_local(2),nev__,gpu=use_gpu)
& matrix%sizeb_local(1),matrix%sizeb_local(2))
call elpa_func_get_communicators(elpa_hdl,processor%comm,processor%coords(1),processor%coords(2))
if (istwf_k/=2) then
@ -2912,6 +2912,10 @@ subroutine solve_gevp_complex(na,nev,na_rows,na_cols,nblk,a,b,ev,z,tmp1,tmp2, &
if (present(use_gpu_elpa)) use_gpu_elpa_=use_gpu_elpa
#endif
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
call nvtxStartRange("solve_gevp_complex",12)
#endif
! Allocate ELPA handle
call elpa_func_allocate(elpa_hdl,blacs_ctx=sc_desc(CTXT_),gpu=use_gpu_elpa_)
call elpa_func_set_matrix(elpa_hdl,na,nblk,nev,na_rows,na_cols)
@ -2920,6 +2924,7 @@ subroutine solve_gevp_complex(na,nev,na_rows,na_cols,nblk,a,b,ev,z,tmp1,tmp2, &
call elpa_func_solve_gevp_2stage(elpa_hdl,a,b,z,ev,nev)
call elpa_func_deallocate(elpa_hdl)
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
call nvtxEndRange()
#endif
@ -2957,6 +2962,10 @@ subroutine solve_gevp_real(na,nev,na_rows,na_cols,nblk,a,b,ev,z,tmp1,tmp2, &
if (present(use_gpu_elpa)) use_gpu_elpa_=use_gpu_elpa
#endif
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
call nvtxStartRange("solve_gevp_real",12)
#endif
! Allocate ELPA handle
call elpa_func_allocate(elpa_hdl,blacs_ctx=sc_desc(CTXT_),gpu=use_gpu_elpa_)
call elpa_func_set_matrix(elpa_hdl,na,nblk,nev,na_rows,na_cols)
@ -2999,6 +3008,10 @@ subroutine solve_gevp_real(na,nev,na_rows,na_cols,nblk,a,b,ev,z,tmp1,tmp2, &
call elpa_func_deallocate(elpa_hdl)
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
call nvtxEndRange()
#endif
end subroutine solve_gevp_real
!!***
#endif
@ -3050,6 +3063,7 @@ subroutine compute_generalized_eigen_problem(processor,matrix1,matrix2,results,e
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
call nvtxStartRange("slk_compute_generalized_eigen", 10)
#endif
nev__ = matrix1%sizeb_global(2); if (present(nev)) nev__ = nev
use_gpu_elpa__ = 0
#ifdef HAVE_LINALG_ELPA
@ -3082,6 +3096,7 @@ subroutine compute_generalized_eigen_problem(processor,matrix1,matrix2,results,e
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
call nvtxEndRange()
#endif
#else
!Arguments ------------------------------------
class(processor_scalapack),intent(in) :: processor
@ -3292,6 +3307,10 @@ subroutine compute_eigen1(comm,processor,cplex,nbli_global,nbco_global,matrix,ve
if (present(use_gpu_elpa)) use_gpu_elpa_=use_gpu_elpa
#endif
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
call nvtxStartRange("slk_compute_eigen1", 7)
#endif
! ================================
! INITIALISATION SCALAPACK MATRIX
! ================================
@ -3358,6 +3377,10 @@ subroutine compute_eigen1(comm,processor,cplex,nbli_global,nbco_global,matrix,ve
ABI_SFREE(z_tmp_evec)
ABI_SFREE(r_tmp_evec)
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
call nvtxEndRange()
#endif
#ifndef HAVE_LINALG_ELPA
ABI_UNUSED(use_gpu_elpa)
#endif
@ -3423,6 +3446,10 @@ subroutine compute_eigen2(comm,processor,cplex,nbli_global,nbco_global,matrix1,m
if (present(use_gpu_elpa)) use_gpu_elpa_=use_gpu_elpa
#endif
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
call nvtxStartRange("slk_compute_eigen2", 7)
#endif
! ================================
! INITIALISATION SCALAPACK MATRIX
! ================================
@ -3495,13 +3522,14 @@ subroutine compute_eigen2(comm,processor,cplex,nbli_global,nbco_global,matrix1,m
call sca_matrix2%free()
call sca_matrix3%free()
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
call nvtxEndRange()
#endif
#ifndef HAVE_LINALG_ELPA
ABI_UNUSED(use_gpu_elpa)
#endif
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
call nvtxEndRange()
#endif
end subroutine compute_eigen2
!!***

View File

@ -29,10 +29,14 @@ cpp_options:
- DEV_YP_DEBUG_PSP
- DEV_YP_VDWXC
- FC_ABSOFT
- FC_CRAY
- FC_FLANG
- FC_GNU
- FC_IBM
- FC_INTEL
- FC_LLVM
- FC_NAG
- FC_NVHPC
- FC_PGI
- FFT_PRECISION
- GPU_FOUR_HEADER_H
@ -88,6 +92,7 @@ cpp_options:
- HAVE_GPU_CUDA_DP
- HAVE_GPU_CUDA_SP
- HAVE_GPU_CUDA_TM
- HAVE_GPU_HIP
- HAVE_GPU_MPI
- HAVE_GPU_SERIAL
- HAVE_GW_DPC
@ -154,6 +159,8 @@ cpp_options:
- HAVE_NUMPY
- HAVE_OMP_COLLAPSE
- HAVE_OPENMP
- HAVE_OPENMP_OFFLOAD
- HAVE_OPENMP_OFFLOAD_DATASTRUCTURE
- HAVE_OS_LINUX
- HAVE_OS_MACOSX
- HAVE_OS_WINDOWS

View File

@ -124,7 +124,6 @@ type, public :: dataset_type
integer :: diismemory
integer :: dipdip = 1
integer :: dipquad = 1
integer :: distribute_gemm_nonlop = 0
integer :: dmatpuopt
integer :: dmatudiag
integer :: dmft_dc
@ -202,7 +201,6 @@ type, public :: dataset_type
integer :: ga_algor
integer :: ga_fitness
integer :: ga_n_rules
integer :: gemm_nonlop_split_size = 1
integer :: getcell = 0
integer :: getddb = 0
integer :: getdvdb = 0
@ -623,7 +621,6 @@ type, public :: dataset_type
integer :: tl_nprccg
!U
integer :: ucrpa
integer :: use_gpu_openmp_threads
integer :: usedmatpu
integer :: usedmft
integer :: useexexch
@ -1445,7 +1442,6 @@ type(dataset_type) function dtset_copy(dtin) result(dtout)
dtout%delayperm = dtin%delayperm
dtout%diismemory = dtin%diismemory
dtout%dipquad = dtin%dipquad
dtout%distribute_gemm_nonlop = dtin%distribute_gemm_nonlop
dtout%dmatpuopt = dtin%dmatpuopt
dtout%dmatudiag = dtin%dmatudiag
dtout%dmft_dc = dtin%dmft_dc
@ -1578,7 +1574,6 @@ type(dataset_type) function dtset_copy(dtin) result(dtout)
dtout%ga_algor = dtin%ga_algor
dtout%ga_fitness = dtin%ga_fitness
dtout%ga_n_rules = dtin%ga_n_rules
dtout%gemm_nonlop_split_size = dtin%gemm_nonlop_split_size
dtout%getbseig = dtin%getbseig
dtout%getbsreso = dtin%getbsreso
dtout%getbscoup = dtin%getbscoup
@ -1994,7 +1989,6 @@ type(dataset_type) function dtset_copy(dtin) result(dtout)
dtout%tim1rev = dtin%tim1rev
dtout%timopt = dtin%timopt
dtout%use_gemm_nonlop = dtin%use_gemm_nonlop
dtout%use_gpu_openmp_threads = dtin%use_gpu_openmp_threads
dtout%useextfpmd = dtin%useextfpmd
dtout%use_yaml = dtin%use_yaml ! This variable activates the Yaml output for testing purposes
! It will be removed when Yaml output enters production.
@ -3317,7 +3311,7 @@ subroutine chkvars(string)
list_vars=trim(list_vars)//' delayperm densfor_pred densty dfield'
list_vars=trim(list_vars)//' dfpt_sciss diecut diegap dielam dielng diemac'
list_vars=trim(list_vars)//' diemix diemixmag diismemory'
list_vars=trim(list_vars)//' dilatmx dipdip dipquad dipdip_prt dipdip_range distribute_gemm_nonlop'
list_vars=trim(list_vars)//' dilatmx dipdip dipquad dipdip_prt dipdip_range'
list_vars=trim(list_vars)//' dmatpawu dmatpuopt dmatudiag'
list_vars=trim(list_vars)//' dmftbandi dmftbandf dmftctqmc_basis'
list_vars=trim(list_vars)//' dmftctqmc_check dmftctqmc_correl dmftctqmc_gmove'
@ -3365,7 +3359,7 @@ subroutine chkvars(string)
list_vars=trim(list_vars)//' f4of2_sla f6of2_sla'
!G
list_vars=trim(list_vars)//' ga_algor ga_fitness ga_n_rules ga_opt_percent ga_rules'
list_vars=trim(list_vars)//' gemm_nonlop_split_size genafm getbscoup getbseig getbsreso getcell'
list_vars=trim(list_vars)//' genafm getbscoup getbseig getbsreso getcell'
list_vars=trim(list_vars)//' getddb getddb_filepath getden_filepath getddk'
list_vars=trim(list_vars)//' getdelfd getdkdk getdkde getden getkden getdvdb getdvdb_filepath'
list_vars=trim(list_vars)//' getefmas getkerange_filepath getgam_eig2nkq'
@ -3539,7 +3533,6 @@ subroutine chkvars(string)
list_vars=trim(list_vars)//' userra userrb userrc userrd userre'
list_vars=trim(list_vars)//' usewvl usexcnhat useylm use_gemm_nonlop'
list_vars=trim(list_vars)//' use_slk useextfpmd use_yaml'
list_vars=trim(list_vars)//' use_slk useextfpmd use_yaml'
list_vars=trim(list_vars)//' use_oldchi'
!V
list_vars=trim(list_vars)//' vaclst vacnum vacuum vacwidth vcutgeo'

View File

@ -605,7 +605,7 @@ module m_xgTransposer
call nvtxStartRange("MPI_AllToAllV", 8)
#endif
if( xgTransposer%gou_option == ABI_GPU_KOKKOS) then
if( xgTransposer%gpu_option == ABI_GPU_KOKKOS) then
#if defined(HAVE_GPU_CUDA) && defined(HAVE_KOKKOS) && defined(HAVE_YAKL)
call timab(tim_all2allv,1,tsec)
@ -621,7 +621,6 @@ module m_xgTransposer
recvbuf(:,:) = recvbuf_mpi(:,:)
ABI_FREE(recvbuf_mpi)
#endif
else

View File

@ -205,13 +205,12 @@ subroutine ompgpu_fourwf(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,ist
cfft_size = 2*n1*n2*n3*ndat
#ifdef HAVE_GPU_CUDA
#if defined HAVE_GPU_CUDA
byte_count=sizeof(work_gpu)
!$OMP TARGET DATA USE_DEVICE_PTR(work_gpu)
call gpu_memset(c_loc(work_gpu), 0, byte_count)
!$OMP END TARGET DATA
#endif
#ifdef HAVE_GPU_HIP
#elif defined HAVE_GPU_HIP
!$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) PRIVATE(i1,i2,i3) MAP(to:work_gpu)
do i3=1,n3*ndat
do i2=1,n2
@ -266,12 +265,11 @@ subroutine ompgpu_fourwf(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,ist
i1=kg_kin(1,ipw); if(i1<0)i1=i1+n1;
i2=kg_kin(2,ipw); if(i2<0)i2=i2+n2;
i3=kg_kin(3,ipw); if(i3<0)i3=i3+n3;
#ifdef HAVE_GPU_CUDA
#if defined HAVE_GPU_CUDA
i1inv = modulo(shift_inv1 - i1, n1) + 1
i2inv = modulo(shift_inv2 - i2, n2) + 1
i3inv = modulo(shift_inv3 - i3, n3) + 1
#endif
#ifdef HAVE_GPU_HIP
#elif defined HAVE_GPU_HIP
i1inv = (shift_inv1-i1) - ( ((shift_inv1-i1)/n1) * n1 ) + 1
i2inv = (shift_inv2-i2) - ( ((shift_inv2-i2)/n2) * n2 ) + 1
i3inv = (shift_inv3-i3) - ( ((shift_inv3-i3)/n3) * n3 ) + 1

View File

@ -1513,7 +1513,7 @@ subroutine timana(mpi_enreg,natom,nband,ndtset,nfft,nkpt,npwtot,nsppol,timopt)
percent_limit=0.5_dp
if (timopt<0) percent_limit=0.0001_dp
if (timopt<0) percent_limit=tol12
!if (timopt<0) percent_limit=tol12
!In case there is parallelism, report times for node 0
!if (me==0 .and. nproc>1) then
@ -1591,10 +1591,10 @@ subroutine timana(mpi_enreg,natom,nband,ndtset,nfft,nkpt,npwtot,nsppol,timopt)
end if
!Now, gather all information
!call xmpi_sum(times,spaceworld,ierr)
!call xmpi_sum(ncount,spaceworld,ierr)
!call xmpi_sum(ftimes,spaceworld,ierr)
!call xmpi_sum(nflops,spaceworld,ierr)
call xmpi_sum(times,spaceworld,ierr)
call xmpi_sum(ncount,spaceworld,ierr)
call xmpi_sum(ftimes,spaceworld,ierr)
call xmpi_sum(nflops,spaceworld,ierr)
if (me==0) then ! Only the world master writes

View File

@ -2295,7 +2295,6 @@ subroutine indefo(dtsets, ndtset_alloc, nprocs)
dtsets(idtset)%dielam=half
dtsets(idtset)%diismemory=8
dtsets(idtset)%dilatmx=one
dtsets(idtset)%distribute_gemm_nonlop=0
dtsets(idtset)%dmatpuopt=2
if (size(dtsets(idtset)%dmatpawu,4)>0) dtsets(idtset)%dmatpawu=-10._dp
dtsets(idtset)%dmatudiag=0

View File

@ -257,10 +257,10 @@ SUBROUTINE BathOperatoroffdiag_init(op, flavors, samples, beta, iTech,opt_nondia
FREEIF(op%F)
MALLOC(op%F,(1:op%sizeHybrid+1,1:flavors,1:flavors))
DT_FREEIF(op%tails)
DT_MALLOC(op%tails, (1:op%flavors))
DT_MALLOC(op%tails,(1:op%flavors))
op%tails=0
DT_FREEIF(op%Fshift)
DT_MALLOC(op%Fshift, (1:op%flavors+1))
DT_MALLOC(op%Fshift,(1:op%flavors+1))
op%Fshift=0
CALL Vector_init(op%R,100*op%flavors)

View File

@ -643,7 +643,7 @@ SUBROUTINE Ctqmc_allocateAll(this)
this%measDE = 0.d0
FREEIF(this%mu)
MALLOC(this%mu, (1:flavors) )
MALLOC(this%mu,(1:flavors) )
this%mu = 0.d0
END SUBROUTINE Ctqmc_allocateAll
!!***

View File

@ -665,7 +665,7 @@ SUBROUTINE Ctqmcoffdiag_allocateAll(op)
op%measDE = 0.d0
FREEIF(op%mu)
MALLOC(op%mu, (1:flavors) )
MALLOC(op%mu,(1:flavors) )
op%mu = 0.d0
FREEIF(op%hybri_limit)
MALLOC(op%hybri_limit, (flavors,flavors) )

View File

@ -8,7 +8,7 @@
!! which leads to excellent CPU efficiency and OpenMP scalability.
!!
!! COPYRIGHT
!! Copyright (C) 2014-2022 ABINIT group (AL)
!! Copyright (C) 2014-2022 ABINIT group (MS)
!! This file is distributed under the terms of the
!! GNU General Public License, see ~abinit/COPYING
!! or http://www.gnu.org/copyleft/gpl.txt .
@ -360,7 +360,7 @@ contains
end if
if(allocated(temp_realvec_r)) then
!$OMP TARGET EXIT DATA MAP(delete:temp_realvec_r,temp_realvec_i)
!$OMP TARGET EXIT DATA MAP(delete:temp_realvec_r,temp_realvec_i)
ABI_FREE(temp_realvec_r)
ABI_FREE(temp_realvec_i)
end if
@ -800,7 +800,7 @@ contains
real(dp),intent(inout),target :: vectin(2,npwin*nspinor*ndat)
real(dp),intent(inout) :: enlout(nnlout*ndat)
real(dp),intent(out),target :: svectout(:,:)
real(dp),intent(inout),target :: vectout(:,:) !vz_i
real(dp),intent(inout),target :: vectout(:,:)
real(dp),intent(inout),optional, ABI_CONTIGUOUS target :: vectproj(:,:,:)
type(pawcprj_type),intent(inout) :: cprjin(natom,nspinor*((cpopt+5)/5)*ndat)
@ -826,9 +826,7 @@ contains
character(len=500) :: msg
integer(C_SIZE_T) :: byte_count
#ifdef HAVE_GPU_HIP
type(c_ptr) :: vectin_amdcopy
type(c_ptr) :: vectout_amdcopy
type(c_ptr) :: svectout_amdcopy
type(c_ptr) :: vectin_amdcopy,vectout_amdcopy,svectout_amdcopy
#endif
! *************************************************************************

View File

@ -191,7 +191,6 @@ module m_hamiltonian
! Governs the choice of the GPU implementation:
! = 0 ==> do not use GPU
! > 0 ==> see defs_basis.F90 to have the list of possible GPU implementations
! = 666 ==> use openMP GPU implementation of hamiltonian operators
integer :: usecprj
! usecprj= 1 if cprj projected WF are stored in memory

View File

@ -759,6 +759,7 @@ has_fock=.false.
#ifndef HAVE_GPU_HIP
!$OMP TARGET EXIT DATA MAP(delete:work)
#endif
end if ! type_calc
ABI_NVTX_END_RANGE()
@ -768,6 +769,7 @@ has_fock=.false.
!============================================================
! Application of the non-local potential and the Fock potential
!============================================================
ABI_NVTX_START_RANGE(NVTX_GETGHC_NLOCPOT)
if (type_calc==0 .or. type_calc==2) then
signs=2 ; choice=1 ; nnlout=1 ; idir=0 ; tim_nonlop=1

View File

@ -1467,8 +1467,7 @@ subroutine solve_inner_ompgpu(invovl, ham, cplx, mpi_enreg, proj, ndat, sm1proj,
integer :: additional_steps_to_take,idat,iproj,icplx
integer :: Ptsize(3)
#ifdef HAVE_GPU_HIP
type(c_ptr) :: sm1proj_amdcopy
type(c_ptr) :: PtPsm1proj_amdcopy
type(c_ptr) :: sm1proj_amdcopy,PtPsm1proj_amdcopy
#endif
! *************************************************************************

View File

@ -53,7 +53,7 @@ module m_prep_kgb
use m_ompgpu_fourwf
#endif
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
use m_nvtx
#endif
@ -295,7 +295,7 @@ subroutine prep_getghc(cwavef, gs_hamk, gvnlxc, gwavef, swavef, lambda, blocksiz
if(do_transpose) then
call timab(545,3,tsec)
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
call nvtxStartRange("MPI_AllToAllV", 8)
#endif
if ( ((.not.flag_inv_sym) .and. bandpp==1 .and. mpi_enreg%paral_spinor==0 .and. my_nspinor==2 ).or. &
@ -306,7 +306,7 @@ subroutine prep_getghc(cwavef, gs_hamk, gvnlxc, gwavef, swavef, lambda, blocksiz
call xmpi_alltoallv(cwavef,sendcountsloc,sdisplsloc,cwavef_alltoall2,&
& recvcountsloc,rdisplsloc,spaceComm,ier)
end if
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
call nvtxEndRange()
#endif
call timab(545,2,tsec)
@ -527,56 +527,56 @@ subroutine prep_getghc(cwavef, gs_hamk, gvnlxc, gwavef, swavef, lambda, blocksiz
if ( ((.not.flag_inv_sym) .and. bandpp==1 .and. mpi_enreg%paral_spinor==0 .and. my_nspinor==2 ).or. &
& ((.not.flag_inv_sym) .and. bandpp>1) .or. flag_inv_sym ) then
if (sij_opt==1) then
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
call nvtxStartRange("MPI_AllToAllV", 8)
#endif
call xmpi_alltoallv(swavef_alltoall1,recvcountsloc,rdisplsloc,swavef,&
& sendcountsloc,sdisplsloc,spaceComm,ier)
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
call nvtxEndRange()
#endif
end if
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
call nvtxStartRange("MPI_AllToAllV", 8)
#endif
if (.not.local_gvnlxc) call xmpi_alltoallv(gvnlxc_alltoall1,recvcountsloc,rdisplsloc,gvnlxc,&
& sendcountsloc,sdisplsloc,spaceComm,ier)
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
call nvtxEndRange()
#endif
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
call nvtxStartRange("MPI_AllToAllV", 8)
#endif
call xmpi_alltoallv(gwavef_alltoall1,recvcountsloc,rdisplsloc,gwavef,&
& sendcountsloc,sdisplsloc,spaceComm,ier)
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
call nvtxEndRange()
#endif
else
if (sij_opt==1) then
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
call nvtxStartRange("MPI_AllToAllV", 8)
#endif
call xmpi_alltoallv(swavef_alltoall2,recvcountsloc,rdisplsloc,swavef,&
& sendcountsloc,sdisplsloc,spaceComm,ier)
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
call nvtxEndRange()
#endif
end if
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
call nvtxStartRange("MPI_AllToAllV", 8)
#endif
if (.not.local_gvnlxc) call xmpi_alltoallv(gvnlxc_alltoall2,recvcountsloc,rdisplsloc,gvnlxc,&
& sendcountsloc,sdisplsloc,spaceComm,ier)
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
call nvtxEndRange()
#endif
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
call nvtxStartRange("MPI_AllToAllV", 8)
#endif
call xmpi_alltoallv(gwavef_alltoall2,recvcountsloc,rdisplsloc,gwavef,&
& sendcountsloc,sdisplsloc,spaceComm,ier)
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
call nvtxEndRange()
#endif
end if
@ -860,7 +860,7 @@ subroutine prep_nonlop(choice,cpopt,cwaveprj,enlout_block,hamk,idir,lambdablock,
if(do_transpose) then
call timab(581,1,tsec)
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
call nvtxStartRange("MPI_AllToAllV", 8)
#endif
if (bandpp/=1 .or. (bandpp==1 .and. mpi_enreg%paral_spinor==0.and.nspinortot==2)) then
@ -892,7 +892,7 @@ subroutine prep_nonlop(choice,cpopt,cwaveprj,enlout_block,hamk,idir,lambdablock,
call xmpi_alltoallv(cwavef,sendcountsloc,sdisplsloc,cwavef_alltoall2,&
& recvcountsloc,rdisplsloc,spaceComm,ier)
end if
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
call nvtxEndRange()
#endif
call timab(581,2,tsec)
@ -1000,43 +1000,43 @@ subroutine prep_nonlop(choice,cpopt,cwaveprj,enlout_block,hamk,idir,lambdablock,
call timab(581,1,tsec)
if(bandpp/=1 .or. (bandpp==1 .and. mpi_enreg%paral_spinor==0.and.nspinortot==2))then
if (paw_opt/=3) then
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
call nvtxStartRange("MPI_AllToAllV", 8)
#endif
call xmpi_alltoallv(gvnlc_alltoall1,recvcountsloc,rdisplsloc,gvnlc,&
& sendcountsloc,sdisplsloc,spaceComm,ier)
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
call nvtxEndRange()
#endif
end if
if (paw_opt==3.or.paw_opt==4) then
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
call nvtxStartRange("MPI_AllToAllV", 8)
#endif
call xmpi_alltoallv(gsc_alltoall1,recvcountsloc,rdisplsloc,gsc,&
& sendcountsloc,sdisplsloc,spaceComm,ier)
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
call nvtxEndRange()
#endif
end if
else
if (paw_opt/=3) then
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
call nvtxStartRange("MPI_AllToAllV", 8)
#endif
call xmpi_alltoallv(gvnlc_alltoall2,recvcountsloc,rdisplsloc,gvnlc,&
& sendcountsloc,sdisplsloc,spaceComm,ier)
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
call nvtxEndRange()
#endif
end if
if (paw_opt==3.or.paw_opt==4) then
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
call nvtxStartRange("MPI_AllToAllV", 8)
#endif
call xmpi_alltoallv(gsc_alltoall2,recvcountsloc,rdisplsloc,gsc,&
& sendcountsloc,sdisplsloc,spaceComm,ier)
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
call nvtxEndRange()
#endif
end if
@ -1282,7 +1282,7 @@ subroutine prep_fourwf(rhoaug,blocksize,cwavef,wfraug,iblock,istwf_k,mgfft,&
sdisplsloc(:)=sdispls(:)*2
call timab(547,1,tsec)
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
call nvtxStartRange("MPI_AllToAllV", 8)
#endif
#if defined HAVE_GPU && defined HAVE_YAKL
@ -1305,7 +1305,7 @@ subroutine prep_fourwf(rhoaug,blocksize,cwavef,wfraug,iblock,istwf_k,mgfft,&
call xmpi_alltoallv(cwavef,sendcountsloc,sdisplsloc,cwavef_alltoall2,&
& recvcountsloc,rdisplsloc,spaceComm,ier)
#endif
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
call nvtxEndRange()
#endif
call timab(547,2,tsec)

View File

@ -975,13 +975,6 @@ subroutine forstrnps(cg,cprj,ecut,ecutsm,effmass_free,eigen,electronpositron,foc
gs_hamk%ucvol, gs_hamk%ffnl_k, gs_hamk%ph3d_k, gs_hamk%kpt_k, &
gs_hamk%kg_k, gs_hamk%kpg_k, &
compute_grad_strain=(stress_needed>0),compute_grad_atom=(optfor>0))
!!FIXME signs==1 not handled in CUDA GEMM nonlop
!else if ( gpu_option /= ABI_GPU_LEGACY) then
! call make_gemm_nonlop_ompgpu(my_ikpt,gs_hamk%npw_fft_k,gs_hamk%lmnmax, &
! gs_hamk%ntypat, gs_hamk%indlmn, gs_hamk%nattyp, gs_hamk%istwf_k, &
! gs_hamk%ucvol, gs_hamk%ffnl_k, gs_hamk%ph3d_k, gs_hamk%kpt_k, &
! gs_hamk%kg_k, gs_hamk%kpg_k, &
! compute_grad_strain=(stress_needed>0),compute_grad_atom=(optfor>0))
else if ( gpu_option == ABI_GPU_OPENMP) then
call make_gemm_nonlop_ompgpu(my_ikpt,gs_hamk%npw_fft_k,gs_hamk%lmnmax, &
gs_hamk%ntypat, gs_hamk%indlmn, gs_hamk%nattyp, gs_hamk%istwf_k, &
@ -989,15 +982,6 @@ subroutine forstrnps(cg,cprj,ecut,ecutsm,effmass_free,eigen,electronpositron,foc
gs_hamk%kg_k, gs_hamk%kpg_k, &
compute_grad_strain=(stress_needed>0),compute_grad_atom=(optfor>0))
end if
else
ABI_ERROR("istwfk > 2 is not handled with OpenMP GPU offload mode !")
end if
call make_gemm_nonlop_ompgpu(my_ikpt,gs_hamk%npw_fft_k,gs_hamk%lmnmax, &
gs_hamk%ntypat, gs_hamk%indlmn, gs_hamk%nattyp, gs_hamk%istwf_k, &
gs_hamk%ucvol, gs_hamk%ffnl_k, gs_hamk%ph3d_k, gs_hamk%kpt_k, &
gs_hamk%kg_k, gs_hamk%kpg_k, &
compute_grad_strain=(stress_needed>0),compute_grad_atom=(optfor>0))
end if
end if
! Loop over (blocks of) bands; accumulate forces and/or stresses

View File

@ -180,6 +180,10 @@ subroutine mkrho(cg,dtset,gprimd,irrzon,kg,mcg,mpi_enreg,npwarr,occ,paw_dmft,phn
!arrays
integer,allocatable :: gbound(:,:)
logical :: locc_test,nspinor1TreatedByThisProc,nspinor2TreatedByThisProc
real(dp),allocatable :: occ_diag(:),cwavef_rot(:,:,:,:)
#if defined HAVE_GPU
real(dp),allocatable :: weight_t(:) ! only allocated and used when use_gpu_cuda = 1
#endif
#if defined HAVE_GPU && defined HAVE_YAKL
integer(int32),ABI_CONTIGUOUS pointer :: kg_k(:,:) => null()
real(real64) :: dummy(2,1) = reshape( (/0.0, 0.0/), shape(dummy))

View File

@ -3718,7 +3718,7 @@ subroutine wfd_change_ngfft(Wfd, Cryst, Psps, new_ngfft)
! Recalculate FFT tables.
! Calculate the FFT index of $ R^{-1} (r-\tau) $ used to symmetrize u_Rk.
ABI_REMALLOC(Wfd%irottb, (Wfd%nfftot,Cryst%nsym) )
ABI_REMALLOC(Wfd%irottb, (Wfd%nfftot,Cryst%nsym))
call rotate_FFT_mesh(Cryst%nsym,Cryst%symrel,Cryst%tnons,Wfd%ngfft,Wfd%irottb,iscompatibleFFT)
if (.not. iscompatibleFFT) then

View File

@ -6036,7 +6036,7 @@ subroutine ddb_to_dtset(comm, dtset, filename, psps)
ABI_REMALLOC(dtset%spinat, (3,dtset%natom))
dtset%spinat(:,:) = ddb_hdr%spinat(1:3,1:ddb_hdr%matom)
ABI_REMALLOC(dtset%xred_orig, (3,dtset%natom,mxnimage) )
ABI_REMALLOC(dtset%xred_orig, (3,dtset%natom,mxnimage))
dtset%xred_orig(:,:,1) = ddb_hdr%xred(1:3,1:ddb_hdr%matom)
ABI_REMALLOC(dtset%ziontypat, (dtset%ntypat))

View File

@ -278,12 +278,6 @@ subroutine chebfiwf2(cg,dtset,eig,enl_out,gs_hamk,kinpw,mpi_enreg,&
type(pawcprj_type) :: cprj_dum(gs_hamk%natom,1)
#if defined(HAVE_GPU_CUDA) && defined(HAVE_YAKL)
! other
integer(kind=c_size_t) :: l_pcon_size_bytes
#endif
#if defined(HAVE_GPU_CUDA) && defined(HAVE_YAKL)
! other
integer(kind=c_size_t) :: l_pcon_size_bytes
#endif
@ -581,7 +575,6 @@ subroutine getghc_gsc1(X,AX,BX,transposer)
! ABI_MALLOC(l_gvnlxc,(2,blockdim*spacedim))
!end if
call multithreaded_getghc(l_cpopt,cg,cprj_dum,ghc,gsc,&
l_gs_hamk,l_gvnlxc,eval,l_mpi_enreg,blockdim,l_prtvol,l_sij_opt,l_tim_getghc,0)
@ -590,14 +583,6 @@ subroutine getghc_gsc1(X,AX,BX,transposer)
call gpu_device_synchronize()
#endif
#if defined(HAVE_GPU_CUDA) && defined(HAVE_YAKL)
!if (chebfi%gpu_option==ABI_GPU_KOKKOS) then
call gpu_device_synchronize()
!end if
#endif
!Scale cg, ghc, gsc
if ( l_istwf == 2 ) then
call xgBlock_scale(X ,sqrt2,1,gpu_option=l_gs_hamk%gpu_option)
@ -783,7 +768,6 @@ subroutine getBm1X(X,Bm1X,transposer)
!cwaveprj_next is dummy
if(gemm_nonlop_use_gemm) then
ABI_MALLOC(cwaveprj_next, (1,1))
else
else
ABI_MALLOC(cwaveprj_next, (l_gs_hamk%natom,l_nspinor*blockdim))
call pawcprj_alloc(cwaveprj_next,0,l_gs_hamk%dimcprj)
@ -859,11 +843,11 @@ subroutine getBm1X(X,Bm1X,transposer)
end if
end if
end if
if (l_paw) then
if (l_useria /= 121212) then
ABI_FREE(cwaveprj_next)
end if
ABI_FREE(cwaveprj_next)
end if
ABI_NVTX_END_RANGE()
end subroutine getBm1X

View File

@ -122,7 +122,7 @@ subroutine lobpcgwf2(cg,dtset,eig,occ,enl_out,gs_hamk,isppol,ikpt,inonsc,istep,k
! Important things for NC
integer,parameter :: choice=1, paw_opt=0, signs=1
type(pawcprj_type) :: cprj_dum(gs_hamk%natom,1)
type(pawcprj_type) :: cprj_dum(1,1)
integer :: iband, shift
real(dp) :: gsc_dummy(0,0)
real(dp), allocatable :: l_gvnlxc(:,:)
@ -355,7 +355,7 @@ end subroutine lobpcgwf2
type(xgBlock_t), intent(inout) :: BX
integer :: blockdim
integer :: spacedim
type(pawcprj_type) :: cprj_dum(l_gs_hamk%natom,1)
type(pawcprj_type) :: cprj_dum(1,1)
double precision :: dum
double precision, parameter :: inv_sqrt2 = 1/sqrt2
double precision, pointer :: cg(:,:)

View File

@ -1031,15 +1031,6 @@ subroutine vtorho(afford,atindx,atindx1,cg,compch_fft,cprj,cpus,dbl_nnsclo,&
gs_hamk%ph3d_k,gs_hamk%kpt_k,gs_hamk%kg_k,gs_hamk%kpg_k, &
compute_grad_atom=(optforces>0))
end if
else
ABI_ERROR("istwfk > 2 is not handled with OpenMP GPU offload mode !")
end if
call make_gemm_nonlop_ompgpu(my_ikpt,gs_hamk%npw_fft_k,gs_hamk%lmnmax, &
gs_hamk%ntypat, gs_hamk%indlmn, gs_hamk%nattyp, gs_hamk%istwf_k, &
gs_hamk%ucvol, gs_hamk%ffnl_k, &
gs_hamk%ph3d_k,gs_hamk%kpt_k,gs_hamk%kg_k,gs_hamk%kpg_k, &
compute_grad_atom=(optforces>0))
end if
end if
end if

View File

@ -596,7 +596,7 @@ subroutine outscfcv(atindx1,cg,compch_fft,compch_sph,cprj,dimcprj,dmatpawu,dtfil
! Output of the GSR file (except when we are inside mover)
#ifdef HAVE_NETCDF
#if 0
#ifndef FC_CRAY
if (me == master .and. dtset%prtgsr == 1 .and. dtset%usewvl == 0) then
!.and. (dtset%ionmov /= 0 .or. dtset%optcell /= 0)) then
fname = strcat(dtfil%filnam_ds(4), "_GSR.nc")

View File

@ -456,9 +456,6 @@ subroutine gstate(args_gs,acell,codvsn,cpui,dtfil,dtset,iexit,initialized,&
else if(dtset%gpu_option == ABI_GPU_DISABLED) then
call init_gemm_nonlop(dtset%nkpt)
end if
else if(dtset%gpu_option == ABI_GPU_DISABLED) then
call init_gemm_nonlop(dtset%nkpt)
end if
end if
gemm_nonlop_is_distributed = .false.
@ -1812,8 +1809,6 @@ subroutine gstate(args_gs,acell,codvsn,cpui,dtfil,dtset,iexit,initialized,&
call destroy_gemm_nonlop(dtset%nkpt)
else if(dtset%gpu_option==ABI_GPU_DISABLED) then
call destroy_gemm_nonlop(dtset%nkpt)
else if(dtset%gpu_option==ABI_GPU_DISABLED) then
call destroy_gemm_nonlop(dtset%nkpt)
end if
gemm_nonlop_use_gemm = .false.
end if

View File

@ -68,8 +68,9 @@
#include "abi_common.h"
! nvtx related macro definition
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
#include "nvtx_macros.h"
#endif
program abinit
@ -385,7 +386,7 @@ program abinit
end if
#endif
#ifdef HAVE_GPU_MARKERS
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
NVTX_INIT(use_nvtx)
#endif