Solve merge conflicts

2023-12-17 21:32:32 +01:00 · 2023-12-17 21:32:32 +01:00 · bd2354497d
parent 420ed8ab0f
commit bd2354497d
32 changed files with 139 additions and 180 deletions
--- a/config.h.cmake
+++ b/config.h.cmake
@ -366,6 +366,9 @@
 /* Define to 1 if you want to activate support for OpenMP GPU offload. */
 #cmakedefine HAVE_OPENMP_OFFLOAD @HAVE_OPENMP_OFFLOAD@

+/* Define to 1 if you want to activate support for OpenMP GPU offload (DATASTRUCTURE option). */
+#cmakedefine HAVE_OPENMP_OFFLOAD_DATASTRUCTURE @HAVE_OPENMP_OFFLOAD_DATASTRUCTURE@
+
 /* Set to 1 if OpenMP has a working implementation of COLLAPSE. */
 #cmakedefine HAVE_OMP_COLLAPSE @HAVE_OMP_COLLAPSE@

--- a/shared/common/src/17_gpu_toolbox/cuda_api_error_check.h
+++ b/shared/common/src/17_gpu_toolbox/cuda_api_error_check.h
@ -485,6 +485,8 @@ inline const char* _ConvertSMVer2ArchName(int major, int minor) {
      {0x75, "Turing"},
      {0x80, "Ampere"},
      {0x86, "Ampere"},
+      {0x89, "AdaLovelace"},
+      {0x90, "Hopper"},
      {-1, "Graphics Device"}};

  int index = 0;
--- a/shared/common/src/17_gpu_toolbox/hip_api_error_check.h
+++ b/shared/common/src/17_gpu_toolbox/hip_api_error_check.h
@ -499,6 +499,8 @@ inline const char* _ConvertSMVer2ArchName(int major, int minor) {
      {0x75, "Turing"},
      {0x80, "Ampere"},
      {0x86, "Ampere"},
+      {0x89, "AdaLovelace"},
+      {0x90, "Hopper"},
      {-1, "Graphics Device"}};

  int index = 0;
--- a/shared/common/src/17_gpu_toolbox/m_initcuda.F90
+++ b/shared/common/src/17_gpu_toolbox/m_initcuda.F90
@ -418,39 +418,24 @@ end subroutine Get_Mem_Dev
 #if defined HAVE_GPU

 ! Closing YAKL and Kokkos if opened
- if (gpu_option==ABI_GPU_KOKKOS .or. gpu_option==ABI_GPU_LEGACY) then
+ if (gpu_option==ABI_GPU_KOKKOS) then
 #ifdef HAVE_YAKL
   call gator_finalize()
   write(std_out,*)'yakl gator finalized'
 #endif
-
 #ifdef HAVE_KOKKOS
   ! finalize kokkos
   call kokkos_finalize()
   write(std_out,*)'kokkos finalized'
 #endif
+ !kokkos_finalize already reset GPU context
+ !if (gpu_option/=ABI_GPU_KOKKOS) call unset_dev()
 end if

- ! kokkos_finalize already reset GPU context
- !if (gpu_option/=ABI_GPU_KOKKOS) call unset_dev()
-
- ! Closing YAKL and Kokkos if opened
- if (gpu_option==ABI_GPU_KOKKOS .or. gpu_option==ABI_GPU_LEGACY) then
-#ifdef HAVE_YAKL
-   call gator_finalize()
-   write(std_out,*)'yakl gator finalized'
-#endif
-
-#ifdef HAVE_KOKKOS
-   ! finalize kokkos
-   call kokkos_finalize()
-   write(std_out,*)'kokkos finalized'
-#endif
+ if (gpu_option==ABI_GPU_LEGACY) then
+   call unset_dev()
 end if

- ! kokkos_finalize already reset GPU context
- !if (gpu_option/=ABI_GPU_KOKKOS) call unset_dev()
-
 #endif
 end subroutine unsetdevice_cuda
 !!***
--- a/shared/common/src/27_toolbox_oop/m_nctk.F90
+++ b/shared/common/src/27_toolbox_oop/m_nctk.F90
@ -583,7 +583,7 @@ subroutine nctk_test_mpiio(print_warning)

 !FIXME nf90create fails when using NVHPC
 ! This might be due to my environment, maybe not, need to investigate this...
-#ifndef FC_NVHPC
+!!#ifndef FC_NVHPC
 #ifdef HAVE_NETCDF_MPI
 if (xmpi_comm_rank(xmpi_world) == master) then
   ! Try to open a file with hdf5.
@ -619,7 +619,7 @@ subroutine nctk_test_mpiio(print_warning)
   ABI_WARNING(msg)
 end if
 #endif
-#endif
+!!#endif

 #ifdef HAVE_NETCDF_DEFAULT
 if (.not. nctk_has_mpiio) then
--- a/shared/common/src/28_numeric_noabirule/abi_gpu_linalg.f90
+++ b/shared/common/src/28_numeric_noabirule/abi_gpu_linalg.f90
@ -5002,7 +5002,7 @@ end subroutine abi_gpu_xcopy_2z
 !!  b
 !!
 !! SIDE EFFECTS
-!!   WARNING! : this routine is a dummy one when HAVE_GPU_CUDA is not enabled
+!!   WARNING! : this routine is a dummy one when HAVE_GPU is not enabled
 !!   the correct one is in 17_gpu_toolbox/gpu_linalg.cu
 !!
 !! SOURCE
--- a/shared/common/src/28_numeric_noabirule/m_elpa.F90
+++ b/shared/common/src/28_numeric_noabirule/m_elpa.F90
@ -204,11 +204,6 @@ end subroutine elpa_func_uninit
 !! INPUTS
 !!  [blacs_ctx]= -- optional -- Blacs context
 !!  [gpu]= -- optional -- Flag (0 or 1): use GPU version (currently only NVidia)
-!!  na=Order of matrix A
-!!  nblk=Blocksize of cyclic distribution, must be the same in both directions!
-!!  local_nrows=Leading dimension of A
-!!  local_ncols=Local columns of matrixes A and Q (eigenvectors)
-!!  nev=Number of eigenvalues needed.
 !!
 !! SIDE EFFECTS
 !!  elpa_hdl(type<elpa_hdl_t>)= ELPA handle
@ -223,6 +218,7 @@ subroutine elpa_func_allocate(elpa_hdl,gpu,blacs_ctx)

 !Local variables-------------------------------
 integer :: err,l_gpu,l_blacs_ctx
+ logical :: gpu_debug_mode=.false.
 character(len=10) :: varname

 ! *********************************************************************
@ -261,49 +257,30 @@ subroutine elpa_func_allocate(elpa_hdl,gpu,blacs_ctx)
     ABI_ERROR("You seem to use an old version of ELPA ( < 2021.x ) which only supports NVIDIA GPUs.")
 #endif
   end if
-
-   call elpa_func_error_handler(err_code=err,err_msg='Error when enabling GPU on ELPA')
-   if (err==ELPA_OK) call elpa_hdl%elpa%set("debug",1,err)
-   call elpa_func_error_handler(err_code=err,err_msg='Error when enabling debug on ELPA')
- end if
-#else
- if (err==0.and.l_gpu==1) elpa_hdl%gpu=l_gpu
-#endif
-
- call elpa_func_error_handler(err_code=err,err_varname=varname)
-
   call elpa_func_error_handler(err_code=err,err_msg='Error when enabling GPU on ELPA')

-   !if (err==ELPA_OK) call elpa_hdl%elpa%set("debug",1,err) 
-   !call elpa_func_error_handler(err_code=err,err_msg='Error when enabling debug on ELPA')
+   if (gpu_debug_mode) then
+     if (err==ELPA_OK) call elpa_hdl%elpa%set("debug",1,err) 
+     call elpa_func_error_handler(err_code=err,err_msg='Error when enabling debug on ELPA')
+   end if

 end if
 #else
- if (err==0.and.l_gpu==1) elpa_hdl%gpu=l_gpu
+ if (err==0.and.l_gpu==1) then
+   elpa_hdl%gpu=l_gpu
+   if (gpu_debug_mode) elpa_hdl%debug=1
+ end if
 #endif

- call elpa_func_error_handler(err_code=err,err_varname=varname)
-
 if (present(blacs_ctx)) then
   if (err==ELPA_OK) call elpa_hdl%elpa%set("blacs_context",int(blacs_ctx,kind=c_int),err)
+   call elpa_func_error_handler(err_code=err,err_varname=varname)
 end if

- elpa_hdl%is_allocated=.true.
-
- ! Setting matrix size
- call elpa_func_set_matrix(elpa_hdl,na,nblk,local_nrows,local_ncols,nev)
-
- if (present(blacs_ctx)) then
-   if (err==ELPA_OK) call elpa_hdl%elpa%set("blacs_context",int(blacs_ctx,kind=c_int),err)
- end if
-
- ! Proper ELPA setup
- err = elpa_hdl%elpa%setup()
- call elpa_func_error_handler(err_code=err,err_msg='Error during ELPA setup')
-
 #if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
 call nvtxEndRange()
 #endif
+
 end subroutine elpa_func_allocate
 !!***

@ -453,6 +430,12 @@ subroutine elpa_func_get_communicators(elpa_hdl,mpi_comm_parent,process_row,proc
   varname='process_col'
   call elpa_hdl%elpa%set(trim(varname),process_col,err)
 end if
+ if (err==ELPA_OK) then
+   varname=''
+   err = elpa_hdl%elpa%setup()
+   call elpa_func_error_handler(err_code=err,err_msg='Error during ELPA setup')
+ endif
+
 #else
 elpa_hdl%mpi_comm_parent=mpi_comm_parent
 elpa_hdl%process_row=process_row
@ -467,10 +450,13 @@ subroutine elpa_func_get_communicators(elpa_hdl,mpi_comm_parent,process_row,proc
 !ELPA-LEGACY-2017
 err=elpa_get_communicators(mpi_comm_parent,process_row,process_col,elpa_hdl%elpa_comm_rows,elpa_hdl%elpa_comm_cols)
 #endif
+
 #endif

 call elpa_func_error_handler(err_code=err,err_msg='Error in elpa_get_communicators',err_varname=varname)

+ elpa_hdl%is_allocated=.true.
+
 end subroutine elpa_func_get_communicators
 !!***

--- a/shared/common/src/28_numeric_noabirule/m_slk.F90
+++ b/shared/common/src/28_numeric_noabirule/m_slk.F90
@ -2687,7 +2687,7 @@ subroutine compute_eigen_problem(processor, matrix, results, eigen, comm, istwf_

  call elpa_func_allocate(elpa_hdl,gpu=use_gpu_elpa_)
  call elpa_func_set_matrix(elpa_hdl,matrix%sizeb_global(1),matrix%sizeb_blocs(1),nev__,&
-&                           matrix%sizeb_local(1),matrix%sizeb_local(2),nev__,gpu=use_gpu)
+&                           matrix%sizeb_local(1),matrix%sizeb_local(2))
  call elpa_func_get_communicators(elpa_hdl,processor%comm,processor%coords(1),processor%coords(2))

  if (istwf_k/=2) then
@ -2912,6 +2912,10 @@ subroutine solve_gevp_complex(na,nev,na_rows,na_cols,nblk,a,b,ev,z,tmp1,tmp2, &
  if (present(use_gpu_elpa)) use_gpu_elpa_=use_gpu_elpa
 #endif

+#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+  call nvtxStartRange("solve_gevp_complex",12)
+#endif
+
 ! Allocate ELPA handle
  call elpa_func_allocate(elpa_hdl,blacs_ctx=sc_desc(CTXT_),gpu=use_gpu_elpa_)
  call elpa_func_set_matrix(elpa_hdl,na,nblk,nev,na_rows,na_cols)
@ -2920,6 +2924,7 @@ subroutine solve_gevp_complex(na,nev,na_rows,na_cols,nblk,a,b,ev,z,tmp1,tmp2, &
  call elpa_func_solve_gevp_2stage(elpa_hdl,a,b,z,ev,nev)

  call elpa_func_deallocate(elpa_hdl)
+
 #if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
  call nvtxEndRange()
 #endif
@ -2957,6 +2962,10 @@ subroutine solve_gevp_real(na,nev,na_rows,na_cols,nblk,a,b,ev,z,tmp1,tmp2, &
  if (present(use_gpu_elpa)) use_gpu_elpa_=use_gpu_elpa
 #endif

+#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+  call nvtxStartRange("solve_gevp_real",12)
+#endif
+
 ! Allocate ELPA handle
  call elpa_func_allocate(elpa_hdl,blacs_ctx=sc_desc(CTXT_),gpu=use_gpu_elpa_)
  call elpa_func_set_matrix(elpa_hdl,na,nblk,nev,na_rows,na_cols)
@ -2999,6 +3008,10 @@ subroutine solve_gevp_real(na,nev,na_rows,na_cols,nblk,a,b,ev,z,tmp1,tmp2, &

  call elpa_func_deallocate(elpa_hdl)

+#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+  call nvtxEndRange()
+#endif
+
 end subroutine solve_gevp_real
 !!***
 #endif
@ -3050,6 +3063,7 @@ subroutine compute_generalized_eigen_problem(processor,matrix1,matrix2,results,e
 #if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
  call nvtxStartRange("slk_compute_generalized_eigen", 10)
 #endif
+
  nev__ = matrix1%sizeb_global(2); if (present(nev)) nev__ = nev
  use_gpu_elpa__ = 0
 #ifdef HAVE_LINALG_ELPA
@ -3082,6 +3096,7 @@ subroutine compute_generalized_eigen_problem(processor,matrix1,matrix2,results,e
 #if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
  call nvtxEndRange()
 #endif
+
 #else
 !Arguments ------------------------------------
  class(processor_scalapack),intent(in)       :: processor
@ -3292,6 +3307,10 @@ subroutine compute_eigen1(comm,processor,cplex,nbli_global,nbco_global,matrix,ve
 if (present(use_gpu_elpa)) use_gpu_elpa_=use_gpu_elpa
 #endif

+#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+ call nvtxStartRange("slk_compute_eigen1", 7)
+#endif
+
 ! ================================
 ! INITIALISATION SCALAPACK MATRIX
 ! ================================
@ -3358,6 +3377,10 @@ subroutine compute_eigen1(comm,processor,cplex,nbli_global,nbco_global,matrix,ve
 ABI_SFREE(z_tmp_evec)
 ABI_SFREE(r_tmp_evec)

+#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+ call nvtxEndRange()
+#endif
+
 #ifndef HAVE_LINALG_ELPA
 ABI_UNUSED(use_gpu_elpa)
 #endif
@ -3423,6 +3446,10 @@ subroutine compute_eigen2(comm,processor,cplex,nbli_global,nbco_global,matrix1,m
 if (present(use_gpu_elpa)) use_gpu_elpa_=use_gpu_elpa
 #endif

+#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+ call nvtxStartRange("slk_compute_eigen2", 7)
+#endif
+
 ! ================================
 ! INITIALISATION SCALAPACK MATRIX
 ! ================================
@ -3495,13 +3522,14 @@ subroutine compute_eigen2(comm,processor,cplex,nbli_global,nbco_global,matrix1,m
 call sca_matrix2%free()
 call sca_matrix3%free()

+#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+ call nvtxEndRange()
+#endif
+
 #ifndef HAVE_LINALG_ELPA
 ABI_UNUSED(use_gpu_elpa)
 #endif

-#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
- call nvtxEndRange()
-#endif
 end subroutine compute_eigen2
 !!***

--- a/split/abinit_cpp_options.yml
+++ b/split/abinit_cpp_options.yml
@ -29,10 +29,14 @@ cpp_options:
 - DEV_YP_DEBUG_PSP
 - DEV_YP_VDWXC
 - FC_ABSOFT
+- FC_CRAY
+- FC_FLANG
 - FC_GNU
 - FC_IBM
 - FC_INTEL
+- FC_LLVM
 - FC_NAG
+- FC_NVHPC
 - FC_PGI
 - FFT_PRECISION
 - GPU_FOUR_HEADER_H
@ -88,6 +92,7 @@ cpp_options:
 - HAVE_GPU_CUDA_DP
 - HAVE_GPU_CUDA_SP
 - HAVE_GPU_CUDA_TM
+- HAVE_GPU_HIP
 - HAVE_GPU_MPI
 - HAVE_GPU_SERIAL
 - HAVE_GW_DPC
@ -154,6 +159,8 @@ cpp_options:
 - HAVE_NUMPY
 - HAVE_OMP_COLLAPSE
 - HAVE_OPENMP
+- HAVE_OPENMP_OFFLOAD
+- HAVE_OPENMP_OFFLOAD_DATASTRUCTURE
 - HAVE_OS_LINUX
 - HAVE_OS_MACOSX
 - HAVE_OS_WINDOWS
--- a/src/44_abitypes_defs/m_dtset.F90
+++ b/src/44_abitypes_defs/m_dtset.F90
@ -124,7 +124,6 @@ type, public :: dataset_type
 integer :: diismemory
 integer :: dipdip = 1
 integer :: dipquad = 1
- integer :: distribute_gemm_nonlop = 0
 integer :: dmatpuopt
 integer :: dmatudiag
 integer :: dmft_dc
@ -202,7 +201,6 @@ type, public :: dataset_type
 integer :: ga_algor
 integer :: ga_fitness
 integer :: ga_n_rules
- integer :: gemm_nonlop_split_size = 1
 integer :: getcell = 0
 integer :: getddb = 0
 integer :: getdvdb = 0
@ -623,7 +621,6 @@ type, public :: dataset_type
 integer :: tl_nprccg
 !U
 integer :: ucrpa
- integer :: use_gpu_openmp_threads
 integer :: usedmatpu
 integer :: usedmft
 integer :: useexexch
@ -1445,7 +1442,6 @@ type(dataset_type) function dtset_copy(dtin) result(dtout)
 dtout%delayperm          = dtin%delayperm
 dtout%diismemory         = dtin%diismemory
 dtout%dipquad            = dtin%dipquad
- dtout%distribute_gemm_nonlop = dtin%distribute_gemm_nonlop
 dtout%dmatpuopt          = dtin%dmatpuopt
 dtout%dmatudiag          = dtin%dmatudiag
 dtout%dmft_dc            = dtin%dmft_dc
@ -1578,7 +1574,6 @@ type(dataset_type) function dtset_copy(dtin) result(dtout)
 dtout%ga_algor           = dtin%ga_algor
 dtout%ga_fitness         = dtin%ga_fitness
 dtout%ga_n_rules         = dtin%ga_n_rules
- dtout%gemm_nonlop_split_size = dtin%gemm_nonlop_split_size
 dtout%getbseig           = dtin%getbseig
 dtout%getbsreso          = dtin%getbsreso
 dtout%getbscoup          = dtin%getbscoup
@ -1994,7 +1989,6 @@ type(dataset_type) function dtset_copy(dtin) result(dtout)
 dtout%tim1rev            = dtin%tim1rev
 dtout%timopt             = dtin%timopt
 dtout%use_gemm_nonlop    = dtin%use_gemm_nonlop
- dtout%use_gpu_openmp_threads = dtin%use_gpu_openmp_threads
 dtout%useextfpmd         = dtin%useextfpmd
 dtout%use_yaml           = dtin%use_yaml   ! This variable activates the Yaml output for testing purposes
                                            ! It will be removed when Yaml output enters production.
@ -3317,7 +3311,7 @@ subroutine chkvars(string)
 list_vars=trim(list_vars)//' delayperm densfor_pred densty dfield'
 list_vars=trim(list_vars)//' dfpt_sciss diecut diegap dielam dielng diemac'
 list_vars=trim(list_vars)//' diemix diemixmag diismemory'
- list_vars=trim(list_vars)//' dilatmx dipdip dipquad dipdip_prt dipdip_range distribute_gemm_nonlop'
+ list_vars=trim(list_vars)//' dilatmx dipdip dipquad dipdip_prt dipdip_range'
 list_vars=trim(list_vars)//' dmatpawu dmatpuopt dmatudiag'
 list_vars=trim(list_vars)//' dmftbandi dmftbandf dmftctqmc_basis'
 list_vars=trim(list_vars)//' dmftctqmc_check dmftctqmc_correl dmftctqmc_gmove'
@ -3365,7 +3359,7 @@ subroutine chkvars(string)
 list_vars=trim(list_vars)//' f4of2_sla f6of2_sla'
 !G
 list_vars=trim(list_vars)//' ga_algor ga_fitness ga_n_rules ga_opt_percent ga_rules'
- list_vars=trim(list_vars)//' gemm_nonlop_split_size genafm getbscoup getbseig getbsreso getcell'
+ list_vars=trim(list_vars)//' genafm getbscoup getbseig getbsreso getcell'
 list_vars=trim(list_vars)//' getddb getddb_filepath getden_filepath getddk'
 list_vars=trim(list_vars)//' getdelfd getdkdk getdkde getden getkden getdvdb getdvdb_filepath'
 list_vars=trim(list_vars)//' getefmas getkerange_filepath getgam_eig2nkq'
@ -3539,7 +3533,6 @@ subroutine chkvars(string)
 list_vars=trim(list_vars)//' userra userrb userrc userrd userre'
 list_vars=trim(list_vars)//' usewvl usexcnhat useylm use_gemm_nonlop'
 list_vars=trim(list_vars)//' use_slk useextfpmd use_yaml'
- list_vars=trim(list_vars)//' use_slk useextfpmd use_yaml'
 list_vars=trim(list_vars)//' use_oldchi'
 !V
 list_vars=trim(list_vars)//' vaclst vacnum vacuum vacwidth vcutgeo'
--- a/src/45_xgTools/m_xgTransposer.F90
+++ b/src/45_xgTools/m_xgTransposer.F90
@ -605,7 +605,7 @@ module m_xgTransposer
     call nvtxStartRange("MPI_AllToAllV", 8)
 #endif

-     if( xgTransposer%gou_option == ABI_GPU_KOKKOS) then
+     if( xgTransposer%gpu_option == ABI_GPU_KOKKOS) then

 #if defined(HAVE_GPU_CUDA) && defined(HAVE_KOKKOS) && defined(HAVE_YAKL)
       call timab(tim_all2allv,1,tsec)
@ -621,7 +621,6 @@ module m_xgTransposer
       recvbuf(:,:) = recvbuf_mpi(:,:)

       ABI_FREE(recvbuf_mpi)
-
 #endif

     else
--- a/src/46_ghc_omp/m_ompgpu_fourwf.F90
+++ b/src/46_ghc_omp/m_ompgpu_fourwf.F90
@ -205,13 +205,12 @@ subroutine ompgpu_fourwf(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,ist

   cfft_size = 2*n1*n2*n3*ndat

-#ifdef HAVE_GPU_CUDA
+#if defined HAVE_GPU_CUDA
   byte_count=sizeof(work_gpu)
   !$OMP TARGET DATA USE_DEVICE_PTR(work_gpu)
   call gpu_memset(c_loc(work_gpu), 0, byte_count)
   !$OMP END TARGET DATA
-#endif
-#ifdef HAVE_GPU_HIP
+#elif defined HAVE_GPU_HIP
   !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) PRIVATE(i1,i2,i3)  MAP(to:work_gpu)
   do i3=1,n3*ndat
     do i2=1,n2
@ -266,12 +265,11 @@ subroutine ompgpu_fourwf(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,ist
         i1=kg_kin(1,ipw); if(i1<0)i1=i1+n1;
         i2=kg_kin(2,ipw); if(i2<0)i2=i2+n2;
         i3=kg_kin(3,ipw); if(i3<0)i3=i3+n3;
-#ifdef HAVE_GPU_CUDA
+#if defined HAVE_GPU_CUDA
         i1inv = modulo(shift_inv1 - i1, n1) + 1
         i2inv = modulo(shift_inv2 - i2, n2) + 1
         i3inv = modulo(shift_inv3 - i3, n3) + 1
-#endif
-#ifdef HAVE_GPU_HIP
+#elif defined HAVE_GPU_HIP
         i1inv = (shift_inv1-i1) - ( ((shift_inv1-i1)/n1) * n1 ) + 1
         i2inv = (shift_inv2-i2) - ( ((shift_inv2-i2)/n2) * n2 ) + 1
         i3inv = (shift_inv3-i3) - ( ((shift_inv3-i3)/n3) * n3 ) + 1
--- a/src/55_abiutil/m_timana.F90
+++ b/src/55_abiutil/m_timana.F90
@ -1513,7 +1513,7 @@ subroutine timana(mpi_enreg,natom,nband,ndtset,nfft,nkpt,npwtot,nsppol,timopt)

 percent_limit=0.5_dp
 if (timopt<0) percent_limit=0.0001_dp
- if (timopt<0) percent_limit=tol12
+ !if (timopt<0) percent_limit=tol12

 !In case there is parallelism, report times for node 0
 !if (me==0 .and. nproc>1) then
@ -1591,10 +1591,10 @@ subroutine timana(mpi_enreg,natom,nband,ndtset,nfft,nkpt,npwtot,nsppol,timopt)
 end if

 !Now, gather all information
- !call xmpi_sum(times,spaceworld,ierr)
- !call xmpi_sum(ncount,spaceworld,ierr)
- !call xmpi_sum(ftimes,spaceworld,ierr)
- !call xmpi_sum(nflops,spaceworld,ierr)
+ call xmpi_sum(times,spaceworld,ierr)
+ call xmpi_sum(ncount,spaceworld,ierr)
+ call xmpi_sum(ftimes,spaceworld,ierr)
+ call xmpi_sum(nflops,spaceworld,ierr)

 if (me==0) then ! Only the world master writes

--- a/src/57_iovars/m_invars1.F90
+++ b/src/57_iovars/m_invars1.F90
@ -2295,7 +2295,6 @@ subroutine indefo(dtsets, ndtset_alloc, nprocs)
   dtsets(idtset)%dielam=half
   dtsets(idtset)%diismemory=8
   dtsets(idtset)%dilatmx=one
-   dtsets(idtset)%distribute_gemm_nonlop=0
   dtsets(idtset)%dmatpuopt=2
   if (size(dtsets(idtset)%dmatpawu,4)>0) dtsets(idtset)%dmatpawu=-10._dp
   dtsets(idtset)%dmatudiag=0
--- a/src/62_ctqmc/m_BathOperatoroffdiag.F90
+++ b/src/62_ctqmc/m_BathOperatoroffdiag.F90
@ -257,10 +257,10 @@ SUBROUTINE BathOperatoroffdiag_init(op, flavors, samples, beta, iTech,opt_nondia
  FREEIF(op%F)
  MALLOC(op%F,(1:op%sizeHybrid+1,1:flavors,1:flavors))
  DT_FREEIF(op%tails)
-  DT_MALLOC(op%tails, (1:op%flavors))
+  DT_MALLOC(op%tails,(1:op%flavors))
  op%tails=0
  DT_FREEIF(op%Fshift)
-  DT_MALLOC(op%Fshift, (1:op%flavors+1))
+  DT_MALLOC(op%Fshift,(1:op%flavors+1))
  op%Fshift=0
  
  CALL Vector_init(op%R,100*op%flavors)
--- a/src/62_ctqmc/m_Ctqmc.F90
+++ b/src/62_ctqmc/m_Ctqmc.F90
@ -643,7 +643,7 @@ SUBROUTINE Ctqmc_allocateAll(this)
  this%measDE = 0.d0

  FREEIF(this%mu)
-  MALLOC(this%mu, (1:flavors) )
+  MALLOC(this%mu,(1:flavors) )
  this%mu = 0.d0
 END SUBROUTINE Ctqmc_allocateAll
 !!***
--- a/src/62_ctqmc/m_Ctqmcoffdiag.F90
+++ b/src/62_ctqmc/m_Ctqmcoffdiag.F90
@ -665,7 +665,7 @@ SUBROUTINE Ctqmcoffdiag_allocateAll(op)
  op%measDE = 0.d0

  FREEIF(op%mu)
-  MALLOC(op%mu, (1:flavors) )
+  MALLOC(op%mu,(1:flavors) )
  op%mu = 0.d0
  FREEIF(op%hybri_limit)
  MALLOC(op%hybri_limit, (flavors,flavors) )
--- a/src/66_nonlocal/m_gemm_nonlop_ompgpu.F90
+++ b/src/66_nonlocal/m_gemm_nonlop_ompgpu.F90
@ -8,7 +8,7 @@
 !!  which leads to excellent CPU efficiency and OpenMP scalability.
 !!
 !! COPYRIGHT
-!! Copyright (C) 2014-2022 ABINIT group (AL)
+!! Copyright (C) 2014-2022 ABINIT group (MS)
 !! This file is distributed under the terms of the
 !! GNU General Public License, see ~abinit/COPYING
 !! or http://www.gnu.org/copyleft/gpl.txt .
@ -360,7 +360,7 @@ contains
  end if

  if(allocated(temp_realvec_r)) then
-    !$OMP TARGET EXIT DATA MAP(delete:temp_realvec_r,temp_realvec_i)
+    !$OMP TARGET EXIT DATA MAP(delete:temp_realvec_r,temp_realvec_i)
    ABI_FREE(temp_realvec_r)
    ABI_FREE(temp_realvec_i)
  end if
@ -800,7 +800,7 @@ contains
  real(dp),intent(inout),target :: vectin(2,npwin*nspinor*ndat)
  real(dp),intent(inout) :: enlout(nnlout*ndat)
  real(dp),intent(out),target :: svectout(:,:)
-  real(dp),intent(inout),target :: vectout(:,:) !vz_i
+  real(dp),intent(inout),target :: vectout(:,:)
  real(dp),intent(inout),optional, ABI_CONTIGUOUS target :: vectproj(:,:,:)
  type(pawcprj_type),intent(inout) :: cprjin(natom,nspinor*((cpopt+5)/5)*ndat)

@ -826,9 +826,7 @@ contains
  character(len=500) :: msg
  integer(C_SIZE_T) :: byte_count
 #ifdef HAVE_GPU_HIP
-  type(c_ptr) :: vectin_amdcopy
-  type(c_ptr) :: vectout_amdcopy
-  type(c_ptr) :: svectout_amdcopy
+  type(c_ptr) :: vectin_amdcopy,vectout_amdcopy,svectout_amdcopy
 #endif

 ! *************************************************************************
--- a/src/66_nonlocal/m_hamiltonian.F90
+++ b/src/66_nonlocal/m_hamiltonian.F90
@ -191,7 +191,6 @@ module m_hamiltonian
  ! Governs the choice of the GPU implementation:
  !        = 0 ==> do not use GPU
  !        > 0 ==> see defs_basis.F90 to have the list of possible GPU implementations
-  !        = 666 ==> use openMP GPU implementation of hamiltonian operators

  integer :: usecprj
   ! usecprj= 1 if cprj projected WF are stored in memory
--- a/src/66_wfs/m_getghc_ompgpu.F90
+++ b/src/66_wfs/m_getghc_ompgpu.F90
@ -759,6 +759,7 @@ has_fock=.false.
 #ifndef HAVE_GPU_HIP
   !$OMP TARGET EXIT DATA MAP(delete:work)
 #endif
+
 end if ! type_calc
 ABI_NVTX_END_RANGE()

@ -768,6 +769,7 @@ has_fock=.false.
 !============================================================
 ! Application of the non-local potential and the Fock potential
 !============================================================
+
   ABI_NVTX_START_RANGE(NVTX_GETGHC_NLOCPOT)
   if (type_calc==0 .or. type_calc==2) then
     signs=2 ; choice=1 ; nnlout=1 ; idir=0 ; tim_nonlop=1
--- a/src/66_wfs/m_invovl.F90
+++ b/src/66_wfs/m_invovl.F90
@ -1467,8 +1467,7 @@ subroutine solve_inner_ompgpu(invovl, ham, cplx, mpi_enreg, proj, ndat, sm1proj,
 integer :: additional_steps_to_take,idat,iproj,icplx
 integer :: Ptsize(3)
 #ifdef HAVE_GPU_HIP
- type(c_ptr) :: sm1proj_amdcopy
- type(c_ptr) :: PtPsm1proj_amdcopy
+ type(c_ptr) :: sm1proj_amdcopy,PtPsm1proj_amdcopy
 #endif

 ! *************************************************************************
--- a/src/66_wfs/m_prep_kgb.F90
+++ b/src/66_wfs/m_prep_kgb.F90
@ -53,7 +53,7 @@ module m_prep_kgb
 use m_ompgpu_fourwf
 #endif

-#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
  use m_nvtx
 #endif

@ -295,7 +295,7 @@ subroutine prep_getghc(cwavef, gs_hamk, gvnlxc, gwavef, swavef, lambda, blocksiz

 if(do_transpose) then
   call timab(545,3,tsec)
-#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
     call nvtxStartRange("MPI_AllToAllV", 8)
 #endif
   if ( ((.not.flag_inv_sym) .and. bandpp==1 .and. mpi_enreg%paral_spinor==0 .and. my_nspinor==2 ).or. &
@ -306,7 +306,7 @@ subroutine prep_getghc(cwavef, gs_hamk, gvnlxc, gwavef, swavef, lambda, blocksiz
     call xmpi_alltoallv(cwavef,sendcountsloc,sdisplsloc,cwavef_alltoall2,&
 &     recvcountsloc,rdisplsloc,spaceComm,ier)
   end if
-#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
     call nvtxEndRange()
 #endif
   call timab(545,2,tsec)
@ -527,56 +527,56 @@ subroutine prep_getghc(cwavef, gs_hamk, gvnlxc, gwavef, swavef, lambda, blocksiz
   if ( ((.not.flag_inv_sym) .and. bandpp==1 .and. mpi_enreg%paral_spinor==0 .and. my_nspinor==2 ).or. &
 &   ((.not.flag_inv_sym) .and. bandpp>1) .or.  flag_inv_sym  ) then
     if (sij_opt==1) then
-#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
       call nvtxStartRange("MPI_AllToAllV", 8)
 #endif
       call xmpi_alltoallv(swavef_alltoall1,recvcountsloc,rdisplsloc,swavef,&
 &       sendcountsloc,sdisplsloc,spaceComm,ier)
-#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
     call nvtxEndRange()
 #endif
     end if
-#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
     call nvtxStartRange("MPI_AllToAllV", 8)
 #endif
     if (.not.local_gvnlxc) call xmpi_alltoallv(gvnlxc_alltoall1,recvcountsloc,rdisplsloc,gvnlxc,&
 &     sendcountsloc,sdisplsloc,spaceComm,ier)
-#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
     call nvtxEndRange()
 #endif
-#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
     call nvtxStartRange("MPI_AllToAllV", 8)
 #endif
     call xmpi_alltoallv(gwavef_alltoall1,recvcountsloc,rdisplsloc,gwavef,&
 &     sendcountsloc,sdisplsloc,spaceComm,ier)
-#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
     call nvtxEndRange()
 #endif
   else
     if (sij_opt==1) then
-#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
     call nvtxStartRange("MPI_AllToAllV", 8)
 #endif
       call xmpi_alltoallv(swavef_alltoall2,recvcountsloc,rdisplsloc,swavef,&
 &       sendcountsloc,sdisplsloc,spaceComm,ier)
-#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
     call nvtxEndRange()
 #endif
     end if
-#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
     call nvtxStartRange("MPI_AllToAllV", 8)
 #endif
     if (.not.local_gvnlxc) call xmpi_alltoallv(gvnlxc_alltoall2,recvcountsloc,rdisplsloc,gvnlxc,&
 &     sendcountsloc,sdisplsloc,spaceComm,ier)
-#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
     call nvtxEndRange()
 #endif
-#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
     call nvtxStartRange("MPI_AllToAllV", 8)
 #endif
     call xmpi_alltoallv(gwavef_alltoall2,recvcountsloc,rdisplsloc,gwavef,&
 &     sendcountsloc,sdisplsloc,spaceComm,ier)
-#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
     call nvtxEndRange()
 #endif
   end if
@ -860,7 +860,7 @@ subroutine prep_nonlop(choice,cpopt,cwaveprj,enlout_block,hamk,idir,lambdablock,

 if(do_transpose) then
   call timab(581,1,tsec)
-#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
     call nvtxStartRange("MPI_AllToAllV", 8)
 #endif
   if (bandpp/=1 .or. (bandpp==1 .and. mpi_enreg%paral_spinor==0.and.nspinortot==2)) then
@ -892,7 +892,7 @@ subroutine prep_nonlop(choice,cpopt,cwaveprj,enlout_block,hamk,idir,lambdablock,
      call xmpi_alltoallv(cwavef,sendcountsloc,sdisplsloc,cwavef_alltoall2,&
           &     recvcountsloc,rdisplsloc,spaceComm,ier)
   end if
-#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
     call nvtxEndRange()
 #endif
   call timab(581,2,tsec)
@ -1000,43 +1000,43 @@ subroutine prep_nonlop(choice,cpopt,cwaveprj,enlout_block,hamk,idir,lambdablock,
     call timab(581,1,tsec)
     if(bandpp/=1 .or. (bandpp==1 .and. mpi_enreg%paral_spinor==0.and.nspinortot==2))then
       if (paw_opt/=3) then
-#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
     call nvtxStartRange("MPI_AllToAllV", 8)
 #endif
         call xmpi_alltoallv(gvnlc_alltoall1,recvcountsloc,rdisplsloc,gvnlc,&
 &         sendcountsloc,sdisplsloc,spaceComm,ier)
-#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
     call nvtxEndRange()
 #endif
       end if
       if (paw_opt==3.or.paw_opt==4) then
-#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
     call nvtxStartRange("MPI_AllToAllV", 8)
 #endif
         call xmpi_alltoallv(gsc_alltoall1,recvcountsloc,rdisplsloc,gsc,&
 &         sendcountsloc,sdisplsloc,spaceComm,ier)
-#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
     call nvtxEndRange()
 #endif
       end if
     else
       if (paw_opt/=3) then
-#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
     call nvtxStartRange("MPI_AllToAllV", 8)
 #endif
         call xmpi_alltoallv(gvnlc_alltoall2,recvcountsloc,rdisplsloc,gvnlc,&
 &         sendcountsloc,sdisplsloc,spaceComm,ier)
-#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
     call nvtxEndRange()
 #endif
       end if
       if (paw_opt==3.or.paw_opt==4) then
-#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
     call nvtxStartRange("MPI_AllToAllV", 8)
 #endif
         call xmpi_alltoallv(gsc_alltoall2,recvcountsloc,rdisplsloc,gsc,&
 &         sendcountsloc,sdisplsloc,spaceComm,ier)
-#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
     call nvtxEndRange()
 #endif
       end if
@ -1282,7 +1282,7 @@ subroutine prep_fourwf(rhoaug,blocksize,cwavef,wfraug,iblock,istwf_k,mgfft,&
 sdisplsloc(:)=sdispls(:)*2

 call timab(547,1,tsec)
-#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
     call nvtxStartRange("MPI_AllToAllV", 8)
 #endif
 #if defined HAVE_GPU && defined HAVE_YAKL
@ -1305,7 +1305,7 @@ subroutine prep_fourwf(rhoaug,blocksize,cwavef,wfraug,iblock,istwf_k,mgfft,&
 call xmpi_alltoallv(cwavef,sendcountsloc,sdisplsloc,cwavef_alltoall2,&
      & recvcountsloc,rdisplsloc,spaceComm,ier)
 #endif
-#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
+#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
     call nvtxEndRange()
 #endif
 call timab(547,2,tsec)
--- a/src/67_common/m_forstr.F90
+++ b/src/67_common/m_forstr.F90
@ -975,13 +975,6 @@ subroutine forstrnps(cg,cprj,ecut,ecutsm,effmass_free,eigen,electronpositron,foc
             gs_hamk%ucvol,  gs_hamk%ffnl_k, gs_hamk%ph3d_k, gs_hamk%kpt_k, &
             gs_hamk%kg_k, gs_hamk%kpg_k, &
             compute_grad_strain=(stress_needed>0),compute_grad_atom=(optfor>0))
-       !!FIXME signs==1 not handled in CUDA GEMM nonlop
-       !else if ( gpu_option /= ABI_GPU_LEGACY) then
-       !  call make_gemm_nonlop_ompgpu(my_ikpt,gs_hamk%npw_fft_k,gs_hamk%lmnmax, &
-       !      gs_hamk%ntypat, gs_hamk%indlmn, gs_hamk%nattyp, gs_hamk%istwf_k, &
-       !      gs_hamk%ucvol,  gs_hamk%ffnl_k, gs_hamk%ph3d_k, gs_hamk%kpt_k, &
-       !      gs_hamk%kg_k, gs_hamk%kpg_k, &
-       !      compute_grad_strain=(stress_needed>0),compute_grad_atom=(optfor>0))
       else if ( gpu_option == ABI_GPU_OPENMP) then
         call make_gemm_nonlop_ompgpu(my_ikpt,gs_hamk%npw_fft_k,gs_hamk%lmnmax, &
             gs_hamk%ntypat, gs_hamk%indlmn, gs_hamk%nattyp, gs_hamk%istwf_k, &
@ -989,15 +982,6 @@ subroutine forstrnps(cg,cprj,ecut,ecutsm,effmass_free,eigen,electronpositron,foc
             gs_hamk%kg_k, gs_hamk%kpg_k, &
             compute_grad_strain=(stress_needed>0),compute_grad_atom=(optfor>0))
       end if
-         else
-           ABI_ERROR("istwfk > 2 is not handled with OpenMP GPU offload mode !")
-         end if
-         call make_gemm_nonlop_ompgpu(my_ikpt,gs_hamk%npw_fft_k,gs_hamk%lmnmax, &
-             gs_hamk%ntypat, gs_hamk%indlmn, gs_hamk%nattyp, gs_hamk%istwf_k, &
-             gs_hamk%ucvol,  gs_hamk%ffnl_k, gs_hamk%ph3d_k, gs_hamk%kpt_k, &
-             gs_hamk%kg_k, gs_hamk%kpg_k, &
-             compute_grad_strain=(stress_needed>0),compute_grad_atom=(optfor>0))
-       end if
     end if

 !    Loop over (blocks of) bands; accumulate forces and/or stresses
--- a/src/67_common/m_mkrho.F90
+++ b/src/67_common/m_mkrho.F90
@ -180,6 +180,10 @@ subroutine mkrho(cg,dtset,gprimd,irrzon,kg,mcg,mpi_enreg,npwarr,occ,paw_dmft,phn
 !arrays
 integer,allocatable :: gbound(:,:)
 logical :: locc_test,nspinor1TreatedByThisProc,nspinor2TreatedByThisProc
+ real(dp),allocatable :: occ_diag(:),cwavef_rot(:,:,:,:)
+#if defined HAVE_GPU
+ real(dp),allocatable :: weight_t(:) ! only allocated and used when use_gpu_cuda = 1
+#endif
 #if defined HAVE_GPU && defined HAVE_YAKL
 integer(int32),ABI_CONTIGUOUS pointer :: kg_k(:,:) => null()
 real(real64) :: dummy(2,1) = reshape( (/0.0, 0.0/), shape(dummy))
--- a/src/69_wfdesc/m_wfd.F90
+++ b/src/69_wfdesc/m_wfd.F90
@ -3718,7 +3718,7 @@ subroutine wfd_change_ngfft(Wfd, Cryst, Psps, new_ngfft)

 ! Recalculate FFT tables.
 ! Calculate the FFT index of $ R^{-1} (r-\tau) $ used to symmetrize u_Rk.
- ABI_REMALLOC(Wfd%irottb, (Wfd%nfftot,Cryst%nsym) )
+ ABI_REMALLOC(Wfd%irottb, (Wfd%nfftot,Cryst%nsym))
 call rotate_FFT_mesh(Cryst%nsym,Cryst%symrel,Cryst%tnons,Wfd%ngfft,Wfd%irottb,iscompatibleFFT)

 if (.not. iscompatibleFFT) then
--- a/src/72_response/m_ddb.F90
+++ b/src/72_response/m_ddb.F90
@ -6036,7 +6036,7 @@ subroutine ddb_to_dtset(comm, dtset, filename, psps)
 ABI_REMALLOC(dtset%spinat, (3,dtset%natom))
 dtset%spinat(:,:) = ddb_hdr%spinat(1:3,1:ddb_hdr%matom)

- ABI_REMALLOC(dtset%xred_orig, (3,dtset%natom,mxnimage) )
+ ABI_REMALLOC(dtset%xred_orig, (3,dtset%natom,mxnimage))
 dtset%xred_orig(:,:,1) = ddb_hdr%xred(1:3,1:ddb_hdr%matom)

 ABI_REMALLOC(dtset%ziontypat, (dtset%ntypat))
--- a/src/79_seqpar_mpi/m_chebfiwf.F90
+++ b/src/79_seqpar_mpi/m_chebfiwf.F90
@ -278,12 +278,6 @@ subroutine chebfiwf2(cg,dtset,eig,enl_out,gs_hamk,kinpw,mpi_enreg,&
 type(pawcprj_type) :: cprj_dum(gs_hamk%natom,1)

 #if defined(HAVE_GPU_CUDA) && defined(HAVE_YAKL)
- ! other
- integer(kind=c_size_t) :: l_pcon_size_bytes
-#endif
-
-#if defined(HAVE_GPU_CUDA) && defined(HAVE_YAKL)
- ! other
 integer(kind=c_size_t) :: l_pcon_size_bytes
 #endif

@ -581,7 +575,6 @@ subroutine getghc_gsc1(X,AX,BX,transposer)
 !  ABI_MALLOC(l_gvnlxc,(2,blockdim*spacedim))
 !end if

-
 call multithreaded_getghc(l_cpopt,cg,cprj_dum,ghc,gsc,&
   l_gs_hamk,l_gvnlxc,eval,l_mpi_enreg,blockdim,l_prtvol,l_sij_opt,l_tim_getghc,0)

@ -590,14 +583,6 @@ subroutine getghc_gsc1(X,AX,BX,transposer)
 call gpu_device_synchronize()
 #endif

-
-#if defined(HAVE_GPU_CUDA) && defined(HAVE_YAKL)
- !if (chebfi%gpu_option==ABI_GPU_KOKKOS) then
-   call gpu_device_synchronize()
- !end if
-#endif
-
-
 !Scale cg, ghc, gsc
 if ( l_istwf == 2 ) then
   call xgBlock_scale(X ,sqrt2,1,gpu_option=l_gs_hamk%gpu_option)
@ -783,7 +768,6 @@ subroutine getBm1X(X,Bm1X,transposer)
   !cwaveprj_next is dummy
   if(gemm_nonlop_use_gemm) then
     ABI_MALLOC(cwaveprj_next, (1,1))
-   else
   else
     ABI_MALLOC(cwaveprj_next, (l_gs_hamk%natom,l_nspinor*blockdim))
     call pawcprj_alloc(cwaveprj_next,0,l_gs_hamk%dimcprj)
@ -859,11 +843,11 @@ subroutine getBm1X(X,Bm1X,transposer)
     end if
   end if
 end if
+
 if (l_paw) then
-   if (l_useria /= 121212) then
-     ABI_FREE(cwaveprj_next)
-   end if
+   ABI_FREE(cwaveprj_next)
 end if
+
 ABI_NVTX_END_RANGE()

 end subroutine getBm1X
--- a/src/79_seqpar_mpi/m_lobpcgwf.F90
+++ b/src/79_seqpar_mpi/m_lobpcgwf.F90
@ -122,7 +122,7 @@ subroutine lobpcgwf2(cg,dtset,eig,occ,enl_out,gs_hamk,isppol,ikpt,inonsc,istep,k

 ! Important things for NC
 integer,parameter :: choice=1, paw_opt=0, signs=1
- type(pawcprj_type) :: cprj_dum(gs_hamk%natom,1)
+ type(pawcprj_type) :: cprj_dum(1,1)
 integer :: iband, shift
 real(dp) :: gsc_dummy(0,0)
 real(dp), allocatable :: l_gvnlxc(:,:)
@ -355,7 +355,7 @@ end subroutine lobpcgwf2
  type(xgBlock_t), intent(inout) :: BX
  integer         :: blockdim
  integer         :: spacedim
-  type(pawcprj_type) :: cprj_dum(l_gs_hamk%natom,1)
+  type(pawcprj_type) :: cprj_dum(1,1)
  double precision :: dum
  double precision, parameter :: inv_sqrt2 = 1/sqrt2
  double precision, pointer :: cg(:,:)
--- a/src/79_seqpar_mpi/m_vtorho.F90
+++ b/src/79_seqpar_mpi/m_vtorho.F90
@ -1031,15 +1031,6 @@ subroutine vtorho(afford,atindx,atindx1,cg,compch_fft,cprj,cpus,dbl_nnsclo,&
                 gs_hamk%ph3d_k,gs_hamk%kpt_k,gs_hamk%kg_k,gs_hamk%kpg_k, &
                 compute_grad_atom=(optforces>0))
           end if
-             else
-               ABI_ERROR("istwfk > 2 is not handled with OpenMP GPU offload mode !")
-             end if
-             call make_gemm_nonlop_ompgpu(my_ikpt,gs_hamk%npw_fft_k,gs_hamk%lmnmax, &
-                 gs_hamk%ntypat, gs_hamk%indlmn, gs_hamk%nattyp, gs_hamk%istwf_k, &
-                 gs_hamk%ucvol, gs_hamk%ffnl_k, &
-                 gs_hamk%ph3d_k,gs_hamk%kpt_k,gs_hamk%kg_k,gs_hamk%kpg_k, &
-                 compute_grad_atom=(optforces>0))
-           end if
         end if
       end if

--- a/src/94_scfcv/m_outscfcv.F90
+++ b/src/94_scfcv/m_outscfcv.F90
@ -596,7 +596,7 @@ subroutine outscfcv(atindx1,cg,compch_fft,compch_sph,cprj,dimcprj,dmatpawu,dtfil

 ! Output of the GSR file (except when we are inside mover)
 #ifdef HAVE_NETCDF
-#if 0
+#ifndef FC_CRAY
 if (me == master .and. dtset%prtgsr == 1 .and. dtset%usewvl == 0) then
   !.and. (dtset%ionmov /= 0 .or. dtset%optcell /= 0)) then
   fname = strcat(dtfil%filnam_ds(4), "_GSR.nc")
--- a/src/95_drive/m_gstate.F90
+++ b/src/95_drive/m_gstate.F90
@ -456,9 +456,6 @@ subroutine gstate(args_gs,acell,codvsn,cpui,dtfil,dtset,iexit,initialized,&
   else if(dtset%gpu_option == ABI_GPU_DISABLED) then
     call init_gemm_nonlop(dtset%nkpt)
   end if
-   else if(dtset%gpu_option == ABI_GPU_DISABLED) then
-     call init_gemm_nonlop(dtset%nkpt)
-   end if
 end if

 gemm_nonlop_is_distributed = .false.
@ -1812,8 +1809,6 @@ subroutine gstate(args_gs,acell,codvsn,cpui,dtfil,dtset,iexit,initialized,&
     call destroy_gemm_nonlop(dtset%nkpt)
   else if(dtset%gpu_option==ABI_GPU_DISABLED) then
     call destroy_gemm_nonlop(dtset%nkpt)
-   else if(dtset%gpu_option==ABI_GPU_DISABLED) then
-     call destroy_gemm_nonlop(dtset%nkpt)
   end if
   gemm_nonlop_use_gemm = .false.
 end if
--- a/src/98_main/abinit.F90
+++ b/src/98_main/abinit.F90
@ -68,8 +68,9 @@

 #include "abi_common.h"

-! nvtx related macro definition
+#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
 #include "nvtx_macros.h"
+#endif

 program abinit

@ -385,7 +386,7 @@ program abinit
 end if
 #endif

-#ifdef HAVE_GPU_MARKERS
+#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
 NVTX_INIT(use_nvtx)
 #endif