diff --git a/config.h.cmake b/config.h.cmake index e4ccc95d13..3aad924d9d 100644 --- a/config.h.cmake +++ b/config.h.cmake @@ -366,6 +366,9 @@ /* Define to 1 if you want to activate support for OpenMP GPU offload. */ #cmakedefine HAVE_OPENMP_OFFLOAD @HAVE_OPENMP_OFFLOAD@ +/* Define to 1 if you want to activate support for OpenMP GPU offload of data structures. */ +#cmakedefine HAVE_OPENMP_OFFLOAD_DATASTRUCTURE @HAVE_OPENMP_OFFLOAD_DATASTRUCTURE@ + /* Set to 1 if OpenMP has a working implementation of COLLAPSE. */ #cmakedefine HAVE_OMP_COLLAPSE @HAVE_OMP_COLLAPSE@ diff --git a/shared/common/src/17_gpu_toolbox/cuda_api_error_check.h b/shared/common/src/17_gpu_toolbox/cuda_api_error_check.h index 663db7b938..756b68995f 100644 --- a/shared/common/src/17_gpu_toolbox/cuda_api_error_check.h +++ b/shared/common/src/17_gpu_toolbox/cuda_api_error_check.h @@ -485,6 +485,8 @@ inline const char* _ConvertSMVer2ArchName(int major, int minor) { {0x75, "Turing"}, {0x80, "Ampere"}, {0x86, "Ampere"}, + {0x89, "AdaLovelace"}, + {0x90, "Hopper"}, {-1, "Graphics Device"}}; int index = 0; diff --git a/shared/common/src/17_gpu_toolbox/hip_api_error_check.h b/shared/common/src/17_gpu_toolbox/hip_api_error_check.h index 4a2de57fae..1cb6916801 100644 --- a/shared/common/src/17_gpu_toolbox/hip_api_error_check.h +++ b/shared/common/src/17_gpu_toolbox/hip_api_error_check.h @@ -499,6 +499,8 @@ inline const char* _ConvertSMVer2ArchName(int major, int minor) { {0x75, "Turing"}, {0x80, "Ampere"}, {0x86, "Ampere"}, + {0x89, "AdaLovelace"}, + {0x90, "Hopper"}, {-1, "Graphics Device"}}; int index = 0; diff --git a/shared/common/src/17_gpu_toolbox/m_initcuda.F90 b/shared/common/src/17_gpu_toolbox/m_initcuda.F90 index 222176e780..a595509961 100644 --- a/shared/common/src/17_gpu_toolbox/m_initcuda.F90 +++ b/shared/common/src/17_gpu_toolbox/m_initcuda.F90 @@ -418,39 +418,24 @@ end subroutine Get_Mem_Dev #if defined HAVE_GPU ! Closing YAKL and Kokkos if opened - if (gpu_option==ABI_GPU_KOKKOS .or. gpu_option==ABI_GPU_LEGACY) then + if (gpu_option==ABI_GPU_KOKKOS) then #ifdef HAVE_YAKL call gator_finalize() write(std_out,*)'yakl gator finalized' #endif - #ifdef HAVE_KOKKOS ! finalize kokkos call kokkos_finalize() write(std_out,*)'kokkos finalized' #endif + !kokkos_finalize already reset GPU context + !if (gpu_option/=ABI_GPU_KOKKOS) call unset_dev() end if - ! kokkos_finalize already reset GPU context - !if (gpu_option/=ABI_GPU_KOKKOS) call unset_dev() - - ! Closing YAKL and Kokkos if opened - if (gpu_option==ABI_GPU_KOKKOS .or. gpu_option==ABI_GPU_LEGACY) then -#ifdef HAVE_YAKL - call gator_finalize() - write(std_out,*)'yakl gator finalized' -#endif - -#ifdef HAVE_KOKKOS - ! finalize kokkos - call kokkos_finalize() - write(std_out,*)'kokkos finalized' -#endif + if (gpu_option==ABI_GPU_LEGACY) then + call unset_dev() end if - ! kokkos_finalize already reset GPU context - !if (gpu_option/=ABI_GPU_KOKKOS) call unset_dev() - #endif end subroutine unsetdevice_cuda !!*** diff --git a/shared/common/src/27_toolbox_oop/m_nctk.F90 b/shared/common/src/27_toolbox_oop/m_nctk.F90 index ec576b3049..854b0a3c3b 100644 --- a/shared/common/src/27_toolbox_oop/m_nctk.F90 +++ b/shared/common/src/27_toolbox_oop/m_nctk.F90 @@ -583,7 +583,7 @@ subroutine nctk_test_mpiio(print_warning) !FIXME nf90create fails when using NVHPC ! This might be due to my environment, maybe not, need to investigate this... -#ifndef FC_NVHPC +!!#ifndef FC_NVHPC #ifdef HAVE_NETCDF_MPI if (xmpi_comm_rank(xmpi_world) == master) then ! Try to open a file with hdf5. 
@@ -619,7 +619,7 @@ subroutine nctk_test_mpiio(print_warning) ABI_WARNING(msg) end if #endif -#endif +!!#endif #ifdef HAVE_NETCDF_DEFAULT if (.not. nctk_has_mpiio) then diff --git a/shared/common/src/28_numeric_noabirule/abi_gpu_linalg.f90 b/shared/common/src/28_numeric_noabirule/abi_gpu_linalg.f90 index e6acf272ae..9d4b8fe1b0 100644 --- a/shared/common/src/28_numeric_noabirule/abi_gpu_linalg.f90 +++ b/shared/common/src/28_numeric_noabirule/abi_gpu_linalg.f90 @@ -5002,7 +5002,7 @@ end subroutine abi_gpu_xcopy_2z !! b !! !! SIDE EFFECTS -!! WARNING! : this routine is a dummy one when HAVE_GPU_CUDA is not enabled +!! WARNING! : this routine is a dummy one when HAVE_GPU is not enabled !! the correct one is in 17_toolbox/gpu_linalg.cu !! !! SOURCE diff --git a/shared/common/src/28_numeric_noabirule/m_elpa.F90 b/shared/common/src/28_numeric_noabirule/m_elpa.F90 index 9a1a6d3b51..9f0637d20e 100644 --- a/shared/common/src/28_numeric_noabirule/m_elpa.F90 +++ b/shared/common/src/28_numeric_noabirule/m_elpa.F90 @@ -204,11 +204,6 @@ end subroutine elpa_func_uninit !! INPUTS !! [blacs_ctx]= -- optional -- Blacs context !! [gpu]= -- optional -- Flag (0 or 1): use GPU version (currently only NVidia) -!! na=Order of matrix A -!! nblk=Blocksize of cyclic distribution, must be the same in both directions! -!! local_nrows=Leading dimension of A -!! local_ncols=Local columns of matrixes A and Q (eigenvectors) -!! nev=Number of eigenvalues needed. !! !! SIDE EFFECTS !! elpa_hdl(type)= ELPA handle @@ -223,6 +218,7 @@ subroutine elpa_func_allocate(elpa_hdl,gpu,blacs_ctx) !Local variables------------------------------- integer :: err,l_gpu,l_blacs_ctx + logical :: gpu_debug_mode=.false. character(len=10) :: varname ! ********************************************************************* @@ -261,49 +257,30 @@ subroutine elpa_func_allocate(elpa_hdl,gpu,blacs_ctx) ABI_ERROR("You seem to use an old version of ELPA ( < 2021.x ) which only supports NVIDIA GPUs.") #endif end if - - call elpa_func_error_handler(err_code=err,err_msg='Error when enabling GPU on ELPA') - if (err==ELPA_OK) call elpa_hdl%elpa%set("debug",1,err) - call elpa_func_error_handler(err_code=err,err_msg='Error when enabling debug on ELPA') - end if -#else - if (err==0.and.l_gpu==1) elpa_hdl%gpu=l_gpu -#endif - - call elpa_func_error_handler(err_code=err,err_varname=varname) - call elpa_func_error_handler(err_code=err,err_msg='Error when enabling GPU on ELPA') - !if (err==ELPA_OK) call elpa_hdl%elpa%set("debug",1,err) - !call elpa_func_error_handler(err_code=err,err_msg='Error when enabling debug on ELPA') + if (gpu_debug_mode) then + if (err==ELPA_OK) call elpa_hdl%elpa%set("debug",1,err) + call elpa_func_error_handler(err_code=err,err_msg='Error when enabling debug on ELPA') + end if end if #else - if (err==0.and.l_gpu==1) elpa_hdl%gpu=l_gpu + if (err==0.and.l_gpu==1) then + elpa_hdl%gpu=l_gpu + if (gpu_debug_mode) elpa_hdl%debug=1 + end if #endif - call elpa_func_error_handler(err_code=err,err_varname=varname) - if (present(blacs_ctx)) then if (err==ELPA_OK) call elpa_hdl%elpa%set("blacs_context",int(blacs_ctx,kind=c_int),err) + call elpa_func_error_handler(err_code=err,err_varname=varname) end if - elpa_hdl%is_allocated=.true. - - ! Setting matrix size - call elpa_func_set_matrix(elpa_hdl,na,nblk,local_nrows,local_ncols,nev) - - if (present(blacs_ctx)) then - if (err==ELPA_OK) call elpa_hdl%elpa%set("blacs_context",int(blacs_ctx,kind=c_int),err) - end if - - ! 
Proper ELPA setup - err = elpa_hdl%elpa%setup() - call elpa_func_error_handler(err_code=err,err_msg='Error during ELPA setup') - #if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) call nvtxEndRange() #endif + end subroutine elpa_func_allocate !!*** @@ -453,6 +430,12 @@ subroutine elpa_func_get_communicators(elpa_hdl,mpi_comm_parent,process_row,proc varname='process_col' call elpa_hdl%elpa%set(trim(varname),process_col,err) end if + if (err==ELPA_OK) then + varname='' + err = elpa_hdl%elpa%setup() + call elpa_func_error_handler(err_code=err,err_msg='Error during ELPA setup') + endif + #else elpa_hdl%mpi_comm_parent=mpi_comm_parent elpa_hdl%process_row=process_row @@ -467,10 +450,13 @@ subroutine elpa_func_get_communicators(elpa_hdl,mpi_comm_parent,process_row,proc !ELPA-LEGACY-2017 err=elpa_get_communicators(mpi_comm_parent,process_row,process_col,elpa_hdl%elpa_comm_rows,elpa_hdl%elpa_comm_cols) #endif + #endif call elpa_func_error_handler(err_code=err,err_msg='Error in elpa_get_communicators',err_varname=varname) + elpa_hdl%is_allocated=.true. + end subroutine elpa_func_get_communicators !!*** diff --git a/shared/common/src/28_numeric_noabirule/m_slk.F90 b/shared/common/src/28_numeric_noabirule/m_slk.F90 index ca36a80694..097171d315 100644 --- a/shared/common/src/28_numeric_noabirule/m_slk.F90 +++ b/shared/common/src/28_numeric_noabirule/m_slk.F90 @@ -2687,7 +2687,7 @@ subroutine compute_eigen_problem(processor, matrix, results, eigen, comm, istwf_ call elpa_func_allocate(elpa_hdl,gpu=use_gpu_elpa_) call elpa_func_set_matrix(elpa_hdl,matrix%sizeb_global(1),matrix%sizeb_blocs(1),nev__,& -& matrix%sizeb_local(1),matrix%sizeb_local(2),nev__,gpu=use_gpu) +& matrix%sizeb_local(1),matrix%sizeb_local(2)) call elpa_func_get_communicators(elpa_hdl,processor%comm,processor%coords(1),processor%coords(2)) if (istwf_k/=2) then @@ -2912,6 +2912,10 @@ subroutine solve_gevp_complex(na,nev,na_rows,na_cols,nblk,a,b,ev,z,tmp1,tmp2, & if (present(use_gpu_elpa)) use_gpu_elpa_=use_gpu_elpa #endif +#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) + call nvtxStartRange("solve_gevp_complex",12) +#endif + ! Allocate ELPA handle call elpa_func_allocate(elpa_hdl,blacs_ctx=sc_desc(CTXT_),gpu=use_gpu_elpa_) call elpa_func_set_matrix(elpa_hdl,na,nblk,nev,na_rows,na_cols) @@ -2920,6 +2924,7 @@ subroutine solve_gevp_complex(na,nev,na_rows,na_cols,nblk,a,b,ev,z,tmp1,tmp2, & call elpa_func_solve_gevp_2stage(elpa_hdl,a,b,z,ev,nev) call elpa_func_deallocate(elpa_hdl) + #if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) call nvtxEndRange() #endif @@ -2957,6 +2962,10 @@ subroutine solve_gevp_real(na,nev,na_rows,na_cols,nblk,a,b,ev,z,tmp1,tmp2, & if (present(use_gpu_elpa)) use_gpu_elpa_=use_gpu_elpa #endif +#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) + call nvtxStartRange("solve_gevp_real",12) +#endif + ! 
Allocate ELPA handle call elpa_func_allocate(elpa_hdl,blacs_ctx=sc_desc(CTXT_),gpu=use_gpu_elpa_) call elpa_func_set_matrix(elpa_hdl,na,nblk,nev,na_rows,na_cols) @@ -2999,6 +3008,10 @@ subroutine solve_gevp_real(na,nev,na_rows,na_cols,nblk,a,b,ev,z,tmp1,tmp2, & call elpa_func_deallocate(elpa_hdl) +#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) + call nvtxEndRange() +#endif + end subroutine solve_gevp_real !!*** #endif @@ -3050,6 +3063,7 @@ subroutine compute_generalized_eigen_problem(processor,matrix1,matrix2,results,e #if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) call nvtxStartRange("slk_compute_generalized_eigen", 10) #endif + nev__ = matrix1%sizeb_global(2); if (present(nev)) nev__ = nev use_gpu_elpa__ = 0 #ifdef HAVE_LINALG_ELPA @@ -3082,6 +3096,7 @@ subroutine compute_generalized_eigen_problem(processor,matrix1,matrix2,results,e #if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) call nvtxEndRange() #endif + #else !Arguments ------------------------------------ class(processor_scalapack),intent(in) :: processor @@ -3292,6 +3307,10 @@ subroutine compute_eigen1(comm,processor,cplex,nbli_global,nbco_global,matrix,ve if (present(use_gpu_elpa)) use_gpu_elpa_=use_gpu_elpa #endif +#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) + call nvtxStartRange("slk_compute_eigen1", 7) +#endif + ! ================================ ! INITIALISATION SCALAPACK MATRIX ! ================================ @@ -3358,6 +3377,10 @@ subroutine compute_eigen1(comm,processor,cplex,nbli_global,nbco_global,matrix,ve ABI_SFREE(z_tmp_evec) ABI_SFREE(r_tmp_evec) +#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) + call nvtxEndRange() +#endif + #ifndef HAVE_LINALG_ELPA ABI_UNUSED(use_gpu_elpa) #endif @@ -3423,6 +3446,10 @@ subroutine compute_eigen2(comm,processor,cplex,nbli_global,nbco_global,matrix1,m if (present(use_gpu_elpa)) use_gpu_elpa_=use_gpu_elpa #endif +#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) + call nvtxStartRange("slk_compute_eigen2", 7) +#endif + ! ================================ ! INITIALISATION SCALAPACK MATRIX ! 
================================ @@ -3495,13 +3522,14 @@ subroutine compute_eigen2(comm,processor,cplex,nbli_global,nbco_global,matrix1,m call sca_matrix2%free() call sca_matrix3%free() +#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) + call nvtxEndRange() +#endif + #ifndef HAVE_LINALG_ELPA ABI_UNUSED(use_gpu_elpa) #endif -#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) - call nvtxEndRange() -#endif end subroutine compute_eigen2 !!*** diff --git a/split/abinit_cpp_options.yml b/split/abinit_cpp_options.yml index 41532d4d79..633f184b76 100644 --- a/split/abinit_cpp_options.yml +++ b/split/abinit_cpp_options.yml @@ -29,10 +29,14 @@ cpp_options: - DEV_YP_DEBUG_PSP - DEV_YP_VDWXC - FC_ABSOFT +- FC_CRAY +- FC_FLANG - FC_GNU - FC_IBM - FC_INTEL +- FC_LLVM - FC_NAG +- FC_NVHPC - FC_PGI - FFT_PRECISION - GPU_FOUR_HEADER_H @@ -88,6 +92,7 @@ cpp_options: - HAVE_GPU_CUDA_DP - HAVE_GPU_CUDA_SP - HAVE_GPU_CUDA_TM +- HAVE_GPU_HIP - HAVE_GPU_MPI - HAVE_GPU_SERIAL - HAVE_GW_DPC @@ -154,6 +159,8 @@ cpp_options: - HAVE_NUMPY - HAVE_OMP_COLLAPSE - HAVE_OPENMP +- HAVE_OPENMP_OFFLOAD +- HAVE_OPENMP_OFFLOAD_DATASTRUCTURE - HAVE_OS_LINUX - HAVE_OS_MACOSX - HAVE_OS_WINDOWS diff --git a/src/44_abitypes_defs/m_dtset.F90 b/src/44_abitypes_defs/m_dtset.F90 index a3bd8e340f..e8ea1bfeab 100644 --- a/src/44_abitypes_defs/m_dtset.F90 +++ b/src/44_abitypes_defs/m_dtset.F90 @@ -124,7 +124,6 @@ type, public :: dataset_type integer :: diismemory integer :: dipdip = 1 integer :: dipquad = 1 - integer :: distribute_gemm_nonlop = 0 integer :: dmatpuopt integer :: dmatudiag integer :: dmft_dc @@ -202,7 +201,6 @@ type, public :: dataset_type integer :: ga_algor integer :: ga_fitness integer :: ga_n_rules - integer :: gemm_nonlop_split_size = 1 integer :: getcell = 0 integer :: getddb = 0 integer :: getdvdb = 0 @@ -623,7 +621,6 @@ type, public :: dataset_type integer :: tl_nprccg !U integer :: ucrpa - integer :: use_gpu_openmp_threads integer :: usedmatpu integer :: usedmft integer :: useexexch @@ -1445,7 +1442,6 @@ type(dataset_type) function dtset_copy(dtin) result(dtout) dtout%delayperm = dtin%delayperm dtout%diismemory = dtin%diismemory dtout%dipquad = dtin%dipquad - dtout%distribute_gemm_nonlop = dtin%distribute_gemm_nonlop dtout%dmatpuopt = dtin%dmatpuopt dtout%dmatudiag = dtin%dmatudiag dtout%dmft_dc = dtin%dmft_dc @@ -1578,7 +1574,6 @@ type(dataset_type) function dtset_copy(dtin) result(dtout) dtout%ga_algor = dtin%ga_algor dtout%ga_fitness = dtin%ga_fitness dtout%ga_n_rules = dtin%ga_n_rules - dtout%gemm_nonlop_split_size = dtin%gemm_nonlop_split_size dtout%getbseig = dtin%getbseig dtout%getbsreso = dtin%getbsreso dtout%getbscoup = dtin%getbscoup @@ -1994,7 +1989,6 @@ type(dataset_type) function dtset_copy(dtin) result(dtout) dtout%tim1rev = dtin%tim1rev dtout%timopt = dtin%timopt dtout%use_gemm_nonlop = dtin%use_gemm_nonlop - dtout%use_gpu_openmp_threads = dtin%use_gpu_openmp_threads dtout%useextfpmd = dtin%useextfpmd dtout%use_yaml = dtin%use_yaml ! This variable activates the Yaml output for testing purposes ! It will be removed when Yaml output enters production. 
@@ -3317,7 +3311,7 @@ subroutine chkvars(string) list_vars=trim(list_vars)//' delayperm densfor_pred densty dfield' list_vars=trim(list_vars)//' dfpt_sciss diecut diegap dielam dielng diemac' list_vars=trim(list_vars)//' diemix diemixmag diismemory' - list_vars=trim(list_vars)//' dilatmx dipdip dipquad dipdip_prt dipdip_range distribute_gemm_nonlop' + list_vars=trim(list_vars)//' dilatmx dipdip dipquad dipdip_prt dipdip_range' list_vars=trim(list_vars)//' dmatpawu dmatpuopt dmatudiag' list_vars=trim(list_vars)//' dmftbandi dmftbandf dmftctqmc_basis' list_vars=trim(list_vars)//' dmftctqmc_check dmftctqmc_correl dmftctqmc_gmove' @@ -3365,7 +3359,7 @@ subroutine chkvars(string) list_vars=trim(list_vars)//' f4of2_sla f6of2_sla' !G list_vars=trim(list_vars)//' ga_algor ga_fitness ga_n_rules ga_opt_percent ga_rules' - list_vars=trim(list_vars)//' gemm_nonlop_split_size genafm getbscoup getbseig getbsreso getcell' + list_vars=trim(list_vars)//' genafm getbscoup getbseig getbsreso getcell' list_vars=trim(list_vars)//' getddb getddb_filepath getden_filepath getddk' list_vars=trim(list_vars)//' getdelfd getdkdk getdkde getden getkden getdvdb getdvdb_filepath' list_vars=trim(list_vars)//' getefmas getkerange_filepath getgam_eig2nkq' @@ -3539,7 +3533,6 @@ subroutine chkvars(string) list_vars=trim(list_vars)//' userra userrb userrc userrd userre' list_vars=trim(list_vars)//' usewvl usexcnhat useylm use_gemm_nonlop' list_vars=trim(list_vars)//' use_slk useextfpmd use_yaml' - list_vars=trim(list_vars)//' use_slk useextfpmd use_yaml' list_vars=trim(list_vars)//' use_oldchi' !V list_vars=trim(list_vars)//' vaclst vacnum vacuum vacwidth vcutgeo' diff --git a/src/45_xgTools/m_xgTransposer.F90 b/src/45_xgTools/m_xgTransposer.F90 index 134ba97a09..6a446ee956 100644 --- a/src/45_xgTools/m_xgTransposer.F90 +++ b/src/45_xgTools/m_xgTransposer.F90 @@ -605,7 +605,7 @@ module m_xgTransposer call nvtxStartRange("MPI_AllToAllV", 8) #endif - if( xgTransposer%gou_option == ABI_GPU_KOKKOS) then + if( xgTransposer%gpu_option == ABI_GPU_KOKKOS) then #if defined(HAVE_GPU_CUDA) && defined(HAVE_KOKKOS) && defined(HAVE_YAKL) call timab(tim_all2allv,1,tsec) @@ -621,7 +621,6 @@ module m_xgTransposer recvbuf(:,:) = recvbuf_mpi(:,:) ABI_FREE(recvbuf_mpi) - #endif else diff --git a/src/46_ghc_omp/m_ompgpu_fourwf.F90 b/src/46_ghc_omp/m_ompgpu_fourwf.F90 index f4c01456c0..2f6024e777 100644 --- a/src/46_ghc_omp/m_ompgpu_fourwf.F90 +++ b/src/46_ghc_omp/m_ompgpu_fourwf.F90 @@ -205,13 +205,12 @@ subroutine ompgpu_fourwf(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,ist cfft_size = 2*n1*n2*n3*ndat -#ifdef HAVE_GPU_CUDA +#if defined HAVE_GPU_CUDA byte_count=sizeof(work_gpu) !$OMP TARGET DATA USE_DEVICE_PTR(work_gpu) call gpu_memset(c_loc(work_gpu), 0, byte_count) !$OMP END TARGET DATA -#endif -#ifdef HAVE_GPU_HIP +#elif defined HAVE_GPU_HIP !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) PRIVATE(i1,i2,i3) MAP(to:work_gpu) do i3=1,n3*ndat do i2=1,n2 @@ -266,12 +265,11 @@ subroutine ompgpu_fourwf(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,ist i1=kg_kin(1,ipw); if(i1<0)i1=i1+n1; i2=kg_kin(2,ipw); if(i2<0)i2=i2+n2; i3=kg_kin(3,ipw); if(i3<0)i3=i3+n3; -#ifdef HAVE_GPU_CUDA +#if defined HAVE_GPU_CUDA i1inv = modulo(shift_inv1 - i1, n1) + 1 i2inv = modulo(shift_inv2 - i2, n2) + 1 i3inv = modulo(shift_inv3 - i3, n3) + 1 -#endif -#ifdef HAVE_GPU_HIP +#elif defined HAVE_GPU_HIP i1inv = (shift_inv1-i1) - ( ((shift_inv1-i1)/n1) * n1 ) + 1 i2inv = (shift_inv2-i2) - ( ((shift_inv2-i2)/n2) * n2 ) + 1 i3inv = (shift_inv3-i3) - ( 
((shift_inv3-i3)/n3) * n3 ) + 1 diff --git a/src/55_abiutil/m_timana.F90 b/src/55_abiutil/m_timana.F90 index af4ff7162d..fe0e179b55 100644 --- a/src/55_abiutil/m_timana.F90 +++ b/src/55_abiutil/m_timana.F90 @@ -1513,7 +1513,7 @@ subroutine timana(mpi_enreg,natom,nband,ndtset,nfft,nkpt,npwtot,nsppol,timopt) percent_limit=0.5_dp if (timopt<0) percent_limit=0.0001_dp - if (timopt<0) percent_limit=tol12 + !if (timopt<0) percent_limit=tol12 !In case there is parallelism, report times for node 0 !if (me==0 .and. nproc>1) then @@ -1591,10 +1591,10 @@ subroutine timana(mpi_enreg,natom,nband,ndtset,nfft,nkpt,npwtot,nsppol,timopt) end if !Now, gather all information - !call xmpi_sum(times,spaceworld,ierr) - !call xmpi_sum(ncount,spaceworld,ierr) - !call xmpi_sum(ftimes,spaceworld,ierr) - !call xmpi_sum(nflops,spaceworld,ierr) + call xmpi_sum(times,spaceworld,ierr) + call xmpi_sum(ncount,spaceworld,ierr) + call xmpi_sum(ftimes,spaceworld,ierr) + call xmpi_sum(nflops,spaceworld,ierr) if (me==0) then ! Only the world master writes diff --git a/src/57_iovars/m_invars1.F90 b/src/57_iovars/m_invars1.F90 index 29ca300f98..74912b1458 100644 --- a/src/57_iovars/m_invars1.F90 +++ b/src/57_iovars/m_invars1.F90 @@ -2295,7 +2295,6 @@ subroutine indefo(dtsets, ndtset_alloc, nprocs) dtsets(idtset)%dielam=half dtsets(idtset)%diismemory=8 dtsets(idtset)%dilatmx=one - dtsets(idtset)%distribute_gemm_nonlop=0 dtsets(idtset)%dmatpuopt=2 if (size(dtsets(idtset)%dmatpawu,4)>0) dtsets(idtset)%dmatpawu=-10._dp dtsets(idtset)%dmatudiag=0 diff --git a/src/62_ctqmc/m_BathOperatoroffdiag.F90 b/src/62_ctqmc/m_BathOperatoroffdiag.F90 index c16aff5604..8c18be29db 100644 --- a/src/62_ctqmc/m_BathOperatoroffdiag.F90 +++ b/src/62_ctqmc/m_BathOperatoroffdiag.F90 @@ -257,10 +257,10 @@ SUBROUTINE BathOperatoroffdiag_init(op, flavors, samples, beta, iTech,opt_nondia FREEIF(op%F) MALLOC(op%F,(1:op%sizeHybrid+1,1:flavors,1:flavors)) DT_FREEIF(op%tails) - DT_MALLOC(op%tails, (1:op%flavors)) + DT_MALLOC(op%tails,(1:op%flavors)) op%tails=0 DT_FREEIF(op%Fshift) - DT_MALLOC(op%Fshift, (1:op%flavors+1)) + DT_MALLOC(op%Fshift,(1:op%flavors+1)) op%Fshift=0 CALL Vector_init(op%R,100*op%flavors) diff --git a/src/62_ctqmc/m_Ctqmc.F90 b/src/62_ctqmc/m_Ctqmc.F90 index 4b5dc3b94f..5590c443d4 100644 --- a/src/62_ctqmc/m_Ctqmc.F90 +++ b/src/62_ctqmc/m_Ctqmc.F90 @@ -643,7 +643,7 @@ SUBROUTINE Ctqmc_allocateAll(this) this%measDE = 0.d0 FREEIF(this%mu) - MALLOC(this%mu, (1:flavors) ) + MALLOC(this%mu,(1:flavors) ) this%mu = 0.d0 END SUBROUTINE Ctqmc_allocateAll !!*** diff --git a/src/62_ctqmc/m_Ctqmcoffdiag.F90 b/src/62_ctqmc/m_Ctqmcoffdiag.F90 index 0a44d3e71a..ec00bd9bba 100644 --- a/src/62_ctqmc/m_Ctqmcoffdiag.F90 +++ b/src/62_ctqmc/m_Ctqmcoffdiag.F90 @@ -665,7 +665,7 @@ SUBROUTINE Ctqmcoffdiag_allocateAll(op) op%measDE = 0.d0 FREEIF(op%mu) - MALLOC(op%mu, (1:flavors) ) + MALLOC(op%mu,(1:flavors) ) op%mu = 0.d0 FREEIF(op%hybri_limit) MALLOC(op%hybri_limit, (flavors,flavors) ) diff --git a/src/66_nonlocal/m_gemm_nonlop_ompgpu.F90 b/src/66_nonlocal/m_gemm_nonlop_ompgpu.F90 index e9663e65d6..2e205f4cdd 100644 --- a/src/66_nonlocal/m_gemm_nonlop_ompgpu.F90 +++ b/src/66_nonlocal/m_gemm_nonlop_ompgpu.F90 @@ -8,7 +8,7 @@ !! which leads to excellent CPU efficiency and OpenMP scalability. !! !! COPYRIGHT -!! Copyright (C) 2014-2022 ABINIT group (AL) +!! Copyright (C) 2014-2022 ABINIT group (MS) !! This file is distributed under the terms of the !! GNU General Public License, see ~abinit/COPYING !! or http://www.gnu.org/copyleft/gpl.txt . 
@@ -360,7 +360,7 @@ contains end if if(allocated(temp_realvec_r)) then - !$OMP TARGET EXIT DATA MAP(delete:temp_realvec_r,temp_realvec_i) ABI_FREE(temp_realvec_r) ABI_FREE(temp_realvec_i) end if @@ -800,7 +800,7 @@ contains real(dp),intent(inout),target :: vectin(2,npwin*nspinor*ndat) real(dp),intent(inout) :: enlout(nnlout*ndat) real(dp),intent(out),target :: svectout(:,:) - real(dp),intent(inout),target :: vectout(:,:) !vz_i + real(dp),intent(inout),target :: vectout(:,:) real(dp),intent(inout),optional, ABI_CONTIGUOUS target :: vectproj(:,:,:) type(pawcprj_type),intent(inout) :: cprjin(natom,nspinor*((cpopt+5)/5)*ndat) @@ -826,9 +826,7 @@ contains character(len=500) :: msg integer(C_SIZE_T) :: byte_count #ifdef HAVE_GPU_HIP - type(c_ptr) :: vectin_amdcopy - type(c_ptr) :: vectout_amdcopy - type(c_ptr) :: svectout_amdcopy + type(c_ptr) :: vectin_amdcopy,vectout_amdcopy,svectout_amdcopy #endif ! ************************************************************************* diff --git a/src/66_nonlocal/m_hamiltonian.F90 b/src/66_nonlocal/m_hamiltonian.F90 index 8d2712da50..7eae81f869 100644 --- a/src/66_nonlocal/m_hamiltonian.F90 +++ b/src/66_nonlocal/m_hamiltonian.F90 @@ -191,7 +191,6 @@ module m_hamiltonian ! Governs the choice of the GPU implementation: ! = 0 ==> do not use GPU ! > 0 ==> see defs_basis.F90 to have the list of possible GPU implementations - ! = 666 ==> use openMP GPU implementation of hamiltonian operators integer :: usecprj ! usecprj= 1 if cprj projected WF are stored in memory diff --git a/src/66_wfs/m_getghc_ompgpu.F90 b/src/66_wfs/m_getghc_ompgpu.F90 index d6c68e6a18..b81da2923a 100644 --- a/src/66_wfs/m_getghc_ompgpu.F90 +++ b/src/66_wfs/m_getghc_ompgpu.F90 @@ -759,6 +759,7 @@ has_fock=.false. #ifndef HAVE_GPU_HIP !$OMP TARGET EXIT DATA MAP(delete:work) #endif + end if ! type_calc ABI_NVTX_END_RANGE() @@ -768,6 +769,7 @@ has_fock=.false. !============================================================ ! Application of the non-local potential and the Fock potential !============================================================ + ABI_NVTX_START_RANGE(NVTX_GETGHC_NLOCPOT) if (type_calc==0 .or. type_calc==2) then signs=2 ; choice=1 ; nnlout=1 ; idir=0 ; tim_nonlop=1 diff --git a/src/66_wfs/m_invovl.F90 b/src/66_wfs/m_invovl.F90 index a0ee451fd7..15340ef27c 100644 --- a/src/66_wfs/m_invovl.F90 +++ b/src/66_wfs/m_invovl.F90 @@ -1467,8 +1467,7 @@ subroutine solve_inner_ompgpu(invovl, ham, cplx, mpi_enreg, proj, ndat, sm1proj, integer :: additional_steps_to_take,idat,iproj,icplx integer :: Ptsize(3) #ifdef HAVE_GPU_HIP - type(c_ptr) :: sm1proj_amdcopy - type(c_ptr) :: PtPsm1proj_amdcopy + type(c_ptr) :: sm1proj_amdcopy,PtPsm1proj_amdcopy #endif ! ************************************************************************* diff --git a/src/66_wfs/m_prep_kgb.F90 b/src/66_wfs/m_prep_kgb.F90 index eb29a4d1a5..0615b6cc09 100644 --- a/src/66_wfs/m_prep_kgb.F90 +++ b/src/66_wfs/m_prep_kgb.F90 @@ -53,7 +53,7 @@ module m_prep_kgb use m_ompgpu_fourwf #endif -#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) +#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) use m_nvtx #endif @@ -295,7 +295,7 @@ subroutine prep_getghc(cwavef, gs_hamk, gvnlxc, gwavef, swavef, lambda, blocksiz if(do_transpose) then call timab(545,3,tsec) -#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) +#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) call nvtxStartRange("MPI_AllToAllV", 8) #endif if ( ((.not.flag_inv_sym) .and. 
bandpp==1 .and. mpi_enreg%paral_spinor==0 .and. my_nspinor==2 ).or. & @@ -306,7 +306,7 @@ subroutine prep_getghc(cwavef, gs_hamk, gvnlxc, gwavef, swavef, lambda, blocksiz call xmpi_alltoallv(cwavef,sendcountsloc,sdisplsloc,cwavef_alltoall2,& & recvcountsloc,rdisplsloc,spaceComm,ier) end if -#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) +#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) call nvtxEndRange() #endif call timab(545,2,tsec) @@ -527,56 +527,56 @@ subroutine prep_getghc(cwavef, gs_hamk, gvnlxc, gwavef, swavef, lambda, blocksiz if ( ((.not.flag_inv_sym) .and. bandpp==1 .and. mpi_enreg%paral_spinor==0 .and. my_nspinor==2 ).or. & & ((.not.flag_inv_sym) .and. bandpp>1) .or. flag_inv_sym ) then if (sij_opt==1) then -#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) +#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) call nvtxStartRange("MPI_AllToAllV", 8) #endif call xmpi_alltoallv(swavef_alltoall1,recvcountsloc,rdisplsloc,swavef,& & sendcountsloc,sdisplsloc,spaceComm,ier) -#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) +#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) call nvtxEndRange() #endif end if -#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) +#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) call nvtxStartRange("MPI_AllToAllV", 8) #endif if (.not.local_gvnlxc) call xmpi_alltoallv(gvnlxc_alltoall1,recvcountsloc,rdisplsloc,gvnlxc,& & sendcountsloc,sdisplsloc,spaceComm,ier) -#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) +#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) call nvtxEndRange() #endif -#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) +#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) call nvtxStartRange("MPI_AllToAllV", 8) #endif call xmpi_alltoallv(gwavef_alltoall1,recvcountsloc,rdisplsloc,gwavef,& & sendcountsloc,sdisplsloc,spaceComm,ier) -#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) +#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) call nvtxEndRange() #endif else if (sij_opt==1) then -#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) +#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) call nvtxStartRange("MPI_AllToAllV", 8) #endif call xmpi_alltoallv(swavef_alltoall2,recvcountsloc,rdisplsloc,swavef,& & sendcountsloc,sdisplsloc,spaceComm,ier) -#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) +#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) call nvtxEndRange() #endif end if -#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) +#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) call nvtxStartRange("MPI_AllToAllV", 8) #endif if (.not.local_gvnlxc) call xmpi_alltoallv(gvnlxc_alltoall2,recvcountsloc,rdisplsloc,gvnlxc,& & sendcountsloc,sdisplsloc,spaceComm,ier) -#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) +#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) call nvtxEndRange() #endif -#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) +#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) call nvtxStartRange("MPI_AllToAllV", 8) #endif call xmpi_alltoallv(gwavef_alltoall2,recvcountsloc,rdisplsloc,gwavef,& & sendcountsloc,sdisplsloc,spaceComm,ier) -#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) +#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) call nvtxEndRange() #endif end if @@ -860,7 +860,7 @@ subroutine prep_nonlop(choice,cpopt,cwaveprj,enlout_block,hamk,idir,lambdablock, if(do_transpose) then call timab(581,1,tsec) -#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) +#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) call nvtxStartRange("MPI_AllToAllV", 
8) #endif if (bandpp/=1 .or. (bandpp==1 .and. mpi_enreg%paral_spinor==0.and.nspinortot==2)) then @@ -892,7 +892,7 @@ subroutine prep_nonlop(choice,cpopt,cwaveprj,enlout_block,hamk,idir,lambdablock, call xmpi_alltoallv(cwavef,sendcountsloc,sdisplsloc,cwavef_alltoall2,& & recvcountsloc,rdisplsloc,spaceComm,ier) end if -#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) +#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) call nvtxEndRange() #endif call timab(581,2,tsec) @@ -1000,43 +1000,43 @@ subroutine prep_nonlop(choice,cpopt,cwaveprj,enlout_block,hamk,idir,lambdablock, call timab(581,1,tsec) if(bandpp/=1 .or. (bandpp==1 .and. mpi_enreg%paral_spinor==0.and.nspinortot==2))then if (paw_opt/=3) then -#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) +#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) call nvtxStartRange("MPI_AllToAllV", 8) #endif call xmpi_alltoallv(gvnlc_alltoall1,recvcountsloc,rdisplsloc,gvnlc,& & sendcountsloc,sdisplsloc,spaceComm,ier) -#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) +#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) call nvtxEndRange() #endif end if if (paw_opt==3.or.paw_opt==4) then -#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) +#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) call nvtxStartRange("MPI_AllToAllV", 8) #endif call xmpi_alltoallv(gsc_alltoall1,recvcountsloc,rdisplsloc,gsc,& & sendcountsloc,sdisplsloc,spaceComm,ier) -#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) +#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) call nvtxEndRange() #endif end if else if (paw_opt/=3) then -#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) +#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) call nvtxStartRange("MPI_AllToAllV", 8) #endif call xmpi_alltoallv(gvnlc_alltoall2,recvcountsloc,rdisplsloc,gvnlc,& & sendcountsloc,sdisplsloc,spaceComm,ier) -#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) +#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) call nvtxEndRange() #endif end if if (paw_opt==3.or.paw_opt==4) then -#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) +#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) call nvtxStartRange("MPI_AllToAllV", 8) #endif call xmpi_alltoallv(gsc_alltoall2,recvcountsloc,rdisplsloc,gsc,& & sendcountsloc,sdisplsloc,spaceComm,ier) -#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) +#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) call nvtxEndRange() #endif end if @@ -1282,7 +1282,7 @@ subroutine prep_fourwf(rhoaug,blocksize,cwavef,wfraug,iblock,istwf_k,mgfft,& sdisplsloc(:)=sdispls(:)*2 call timab(547,1,tsec) -#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) +#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) call nvtxStartRange("MPI_AllToAllV", 8) #endif #if defined HAVE_GPU && defined HAVE_YAKL @@ -1305,7 +1305,7 @@ subroutine prep_fourwf(rhoaug,blocksize,cwavef,wfraug,iblock,istwf_k,mgfft,& call xmpi_alltoallv(cwavef,sendcountsloc,sdisplsloc,cwavef_alltoall2,& & recvcountsloc,rdisplsloc,spaceComm,ier) #endif -#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS) +#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) call nvtxEndRange() #endif call timab(547,2,tsec) diff --git a/src/67_common/m_forstr.F90 b/src/67_common/m_forstr.F90 index f45ced98a3..dc610b1909 100644 --- a/src/67_common/m_forstr.F90 +++ b/src/67_common/m_forstr.F90 @@ -975,13 +975,6 @@ subroutine forstrnps(cg,cprj,ecut,ecutsm,effmass_free,eigen,electronpositron,foc gs_hamk%ucvol, gs_hamk%ffnl_k, gs_hamk%ph3d_k, gs_hamk%kpt_k, & gs_hamk%kg_k, gs_hamk%kpg_k, & 
compute_grad_strain=(stress_needed>0),compute_grad_atom=(optfor>0)) - !!FIXME signs==1 not handled in CUDA GEMM nonlop - !else if ( gpu_option /= ABI_GPU_LEGACY) then - ! call make_gemm_nonlop_ompgpu(my_ikpt,gs_hamk%npw_fft_k,gs_hamk%lmnmax, & - ! gs_hamk%ntypat, gs_hamk%indlmn, gs_hamk%nattyp, gs_hamk%istwf_k, & - ! gs_hamk%ucvol, gs_hamk%ffnl_k, gs_hamk%ph3d_k, gs_hamk%kpt_k, & - ! gs_hamk%kg_k, gs_hamk%kpg_k, & - ! compute_grad_strain=(stress_needed>0),compute_grad_atom=(optfor>0)) else if ( gpu_option == ABI_GPU_OPENMP) then call make_gemm_nonlop_ompgpu(my_ikpt,gs_hamk%npw_fft_k,gs_hamk%lmnmax, & gs_hamk%ntypat, gs_hamk%indlmn, gs_hamk%nattyp, gs_hamk%istwf_k, & @@ -989,15 +982,6 @@ subroutine forstrnps(cg,cprj,ecut,ecutsm,effmass_free,eigen,electronpositron,foc gs_hamk%kg_k, gs_hamk%kpg_k, & compute_grad_strain=(stress_needed>0),compute_grad_atom=(optfor>0)) end if - else - ABI_ERROR("istwfk > 2 is not handled with OpenMP GPU offload mode !") - end if - call make_gemm_nonlop_ompgpu(my_ikpt,gs_hamk%npw_fft_k,gs_hamk%lmnmax, & - gs_hamk%ntypat, gs_hamk%indlmn, gs_hamk%nattyp, gs_hamk%istwf_k, & - gs_hamk%ucvol, gs_hamk%ffnl_k, gs_hamk%ph3d_k, gs_hamk%kpt_k, & - gs_hamk%kg_k, gs_hamk%kpg_k, & - compute_grad_strain=(stress_needed>0),compute_grad_atom=(optfor>0)) - end if end if ! Loop over (blocks of) bands; accumulate forces and/or stresses diff --git a/src/67_common/m_mkrho.F90 b/src/67_common/m_mkrho.F90 index 73903412f5..81e4906113 100644 --- a/src/67_common/m_mkrho.F90 +++ b/src/67_common/m_mkrho.F90 @@ -180,6 +180,10 @@ subroutine mkrho(cg,dtset,gprimd,irrzon,kg,mcg,mpi_enreg,npwarr,occ,paw_dmft,phn !arrays integer,allocatable :: gbound(:,:) logical :: locc_test,nspinor1TreatedByThisProc,nspinor2TreatedByThisProc + real(dp),allocatable :: occ_diag(:),cwavef_rot(:,:,:,:) +#if defined HAVE_GPU_CUDA + real(dp),allocatable :: weight_t(:) ! only allocated and used when use_gpu_cuda = 1 +#endif #if defined HAVE_GPU && defined HAVE_YAKL integer(int32),ABI_CONTIGUOUS pointer :: kg_k(:,:) => null() real(real64) :: dummy(2,1) = reshape( (/0.0, 0.0/), shape(dummy)) diff --git a/src/69_wfdesc/m_wfd.F90 b/src/69_wfdesc/m_wfd.F90 index 5fba774605..6e386a52c6 100644 --- a/src/69_wfdesc/m_wfd.F90 +++ b/src/69_wfdesc/m_wfd.F90 @@ -3718,7 +3718,7 @@ subroutine wfd_change_ngfft(Wfd, Cryst, Psps, new_ngfft) ! Recalculate FFT tables. ! Calculate the FFT index of $ R^{-1} (r-\tau) $ used to symmetrize u_Rk. - ABI_REMALLOC(Wfd%irottb, (Wfd%nfftot,Cryst%nsym) ) + ABI_REMALLOC(Wfd%irottb, (Wfd%nfftot,Cryst%nsym)) call rotate_FFT_mesh(Cryst%nsym,Cryst%symrel,Cryst%tnons,Wfd%ngfft,Wfd%irottb,iscompatibleFFT) if (.not. 
iscompatibleFFT) then diff --git a/src/72_response/m_ddb.F90 b/src/72_response/m_ddb.F90 index d49d1dc2a4..04ae667d0e 100644 --- a/src/72_response/m_ddb.F90 +++ b/src/72_response/m_ddb.F90 @@ -6036,7 +6036,7 @@ subroutine ddb_to_dtset(comm, dtset, filename, psps) ABI_REMALLOC(dtset%spinat, (3,dtset%natom)) dtset%spinat(:,:) = ddb_hdr%spinat(1:3,1:ddb_hdr%matom) - ABI_REMALLOC(dtset%xred_orig, (3,dtset%natom,mxnimage) ) + ABI_REMALLOC(dtset%xred_orig, (3,dtset%natom,mxnimage)) dtset%xred_orig(:,:,1) = ddb_hdr%xred(1:3,1:ddb_hdr%matom) ABI_REMALLOC(dtset%ziontypat, (dtset%ntypat)) diff --git a/src/79_seqpar_mpi/m_chebfiwf.F90 b/src/79_seqpar_mpi/m_chebfiwf.F90 index 6d9c0a6ddc..9a9259e1f0 100644 --- a/src/79_seqpar_mpi/m_chebfiwf.F90 +++ b/src/79_seqpar_mpi/m_chebfiwf.F90 @@ -278,12 +278,6 @@ subroutine chebfiwf2(cg,dtset,eig,enl_out,gs_hamk,kinpw,mpi_enreg,& type(pawcprj_type) :: cprj_dum(gs_hamk%natom,1) #if defined(HAVE_GPU_CUDA) && defined(HAVE_YAKL) - ! other - integer(kind=c_size_t) :: l_pcon_size_bytes -#endif - -#if defined(HAVE_GPU_CUDA) && defined(HAVE_YAKL) - ! other integer(kind=c_size_t) :: l_pcon_size_bytes #endif @@ -581,7 +575,6 @@ subroutine getghc_gsc1(X,AX,BX,transposer) ! ABI_MALLOC(l_gvnlxc,(2,blockdim*spacedim)) !end if - call multithreaded_getghc(l_cpopt,cg,cprj_dum,ghc,gsc,& l_gs_hamk,l_gvnlxc,eval,l_mpi_enreg,blockdim,l_prtvol,l_sij_opt,l_tim_getghc,0) @@ -590,14 +583,6 @@ subroutine getghc_gsc1(X,AX,BX,transposer) call gpu_device_synchronize() #endif - -#if defined(HAVE_GPU_CUDA) && defined(HAVE_YAKL) - !if (chebfi%gpu_option==ABI_GPU_KOKKOS) then - call gpu_device_synchronize() - !end if -#endif - - !Scale cg, ghc, gsc if ( l_istwf == 2 ) then call xgBlock_scale(X ,sqrt2,1,gpu_option=l_gs_hamk%gpu_option) @@ -783,7 +768,6 @@ subroutine getBm1X(X,Bm1X,transposer) !cwaveprj_next is dummy if(gemm_nonlop_use_gemm) then ABI_MALLOC(cwaveprj_next, (1,1)) - else else ABI_MALLOC(cwaveprj_next, (l_gs_hamk%natom,l_nspinor*blockdim)) call pawcprj_alloc(cwaveprj_next,0,l_gs_hamk%dimcprj) @@ -859,11 +843,11 @@ subroutine getBm1X(X,Bm1X,transposer) end if end if end if + if (l_paw) then - if (l_useria /= 121212) then - ABI_FREE(cwaveprj_next) - end if + ABI_FREE(cwaveprj_next) end if + ABI_NVTX_END_RANGE() end subroutine getBm1X diff --git a/src/79_seqpar_mpi/m_lobpcgwf.F90 b/src/79_seqpar_mpi/m_lobpcgwf.F90 index 033f22ad00..3f5cec2598 100644 --- a/src/79_seqpar_mpi/m_lobpcgwf.F90 +++ b/src/79_seqpar_mpi/m_lobpcgwf.F90 @@ -122,7 +122,7 @@ subroutine lobpcgwf2(cg,dtset,eig,occ,enl_out,gs_hamk,isppol,ikpt,inonsc,istep,k ! 
Important things for NC integer,parameter :: choice=1, paw_opt=0, signs=1 - type(pawcprj_type) :: cprj_dum(gs_hamk%natom,1) + type(pawcprj_type) :: cprj_dum(1,1) integer :: iband, shift real(dp) :: gsc_dummy(0,0) real(dp), allocatable :: l_gvnlxc(:,:) @@ -355,7 +355,7 @@ end subroutine lobpcgwf2 type(xgBlock_t), intent(inout) :: BX integer :: blockdim integer :: spacedim - type(pawcprj_type) :: cprj_dum(l_gs_hamk%natom,1) + type(pawcprj_type) :: cprj_dum(1,1) double precision :: dum double precision, parameter :: inv_sqrt2 = 1/sqrt2 double precision, pointer :: cg(:,:) diff --git a/src/79_seqpar_mpi/m_vtorho.F90 b/src/79_seqpar_mpi/m_vtorho.F90 index 279bfb5dda..c6f178a78b 100644 --- a/src/79_seqpar_mpi/m_vtorho.F90 +++ b/src/79_seqpar_mpi/m_vtorho.F90 @@ -1031,15 +1031,6 @@ subroutine vtorho(afford,atindx,atindx1,cg,compch_fft,cprj,cpus,dbl_nnsclo,& gs_hamk%ph3d_k,gs_hamk%kpt_k,gs_hamk%kg_k,gs_hamk%kpg_k, & compute_grad_atom=(optforces>0)) end if - else - ABI_ERROR("istwfk > 2 is not handled with OpenMP GPU offload mode !") - end if - call make_gemm_nonlop_ompgpu(my_ikpt,gs_hamk%npw_fft_k,gs_hamk%lmnmax, & - gs_hamk%ntypat, gs_hamk%indlmn, gs_hamk%nattyp, gs_hamk%istwf_k, & - gs_hamk%ucvol, gs_hamk%ffnl_k, & - gs_hamk%ph3d_k,gs_hamk%kpt_k,gs_hamk%kg_k,gs_hamk%kpg_k, & - compute_grad_atom=(optforces>0)) - end if end if end if diff --git a/src/94_scfcv/m_outscfcv.F90 b/src/94_scfcv/m_outscfcv.F90 index 42cda9ffc7..ea9b5ac415 100644 --- a/src/94_scfcv/m_outscfcv.F90 +++ b/src/94_scfcv/m_outscfcv.F90 @@ -596,7 +596,7 @@ subroutine outscfcv(atindx1,cg,compch_fft,compch_sph,cprj,dimcprj,dmatpawu,dtfil ! Output of the GSR file (except when we are inside mover) #ifdef HAVE_NETCDF -#if 0 +#ifndef FC_CRAY if (me == master .and. dtset%prtgsr == 1 .and. dtset%usewvl == 0) then !.and. (dtset%ionmov /= 0 .or. dtset%optcell /= 0)) then fname = strcat(dtfil%filnam_ds(4), "_GSR.nc") diff --git a/src/95_drive/m_gstate.F90 b/src/95_drive/m_gstate.F90 index a0bc34b9c2..f029c4d9e1 100644 --- a/src/95_drive/m_gstate.F90 +++ b/src/95_drive/m_gstate.F90 @@ -456,9 +456,6 @@ subroutine gstate(args_gs,acell,codvsn,cpui,dtfil,dtset,iexit,initialized,& else if(dtset%gpu_option == ABI_GPU_DISABLED) then call init_gemm_nonlop(dtset%nkpt) end if - else if(dtset%gpu_option == ABI_GPU_DISABLED) then - call init_gemm_nonlop(dtset%nkpt) - end if end if gemm_nonlop_is_distributed = .false. @@ -1812,8 +1809,6 @@ subroutine gstate(args_gs,acell,codvsn,cpui,dtfil,dtset,iexit,initialized,& call destroy_gemm_nonlop(dtset%nkpt) else if(dtset%gpu_option==ABI_GPU_DISABLED) then call destroy_gemm_nonlop(dtset%nkpt) - else if(dtset%gpu_option==ABI_GPU_DISABLED) then - call destroy_gemm_nonlop(dtset%nkpt) end if gemm_nonlop_use_gemm = .false. end if diff --git a/src/98_main/abinit.F90 b/src/98_main/abinit.F90 index de0899e844..b905e5e9f8 100644 --- a/src/98_main/abinit.F90 +++ b/src/98_main/abinit.F90 @@ -68,8 +68,9 @@ #include "abi_common.h" -! nvtx related macro definition +#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) #include "nvtx_macros.h" +#endif program abinit @@ -385,7 +386,7 @@ program abinit end if #endif -#ifdef HAVE_GPU_MARKERS +#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) NVTX_INIT(use_nvtx) #endif