mirror of https://github.com/abinit/abinit.git
Solve merge conflicts
This commit is contained in:
parent
420ed8ab0f
commit
bd2354497d
|
@ -366,6 +366,9 @@
|
|||
/* Define to 1 if you want to activate support for OpenMP GPU offload. */
|
||||
#cmakedefine HAVE_OPENMP_OFFLOAD @HAVE_OPENMP_OFFLOAD@
|
||||
|
||||
/* Define to 1 if you want to activate the data-structure variant of OpenMP GPU offload support. */
/* NOTE(review): exact meaning of the DATASTRUCTURE option is not visible here; confirm wording against the build-system option. */
|
||||
#cmakedefine HAVE_OPENMP_OFFLOAD_DATASTRUCTURE @HAVE_OPENMP_OFFLOAD_DATASTRUCTURE@
|
||||
|
||||
/* Set to 1 if OpenMP has a working implementation of COLLAPSE. */
|
||||
#cmakedefine HAVE_OMP_COLLAPSE @HAVE_OMP_COLLAPSE@
|
||||
|
||||
|
|
|
@ -485,6 +485,8 @@ inline const char* _ConvertSMVer2ArchName(int major, int minor) {
|
|||
{0x75, "Turing"},
|
||||
{0x80, "Ampere"},
|
||||
{0x86, "Ampere"},
|
||||
{0x89, "AdaLovelace"},
|
||||
{0x90, "Hopper"},
|
||||
{-1, "Graphics Device"}};
|
||||
|
||||
int index = 0;
|
||||
|
|
|
@ -499,6 +499,8 @@ inline const char* _ConvertSMVer2ArchName(int major, int minor) {
|
|||
{0x75, "Turing"},
|
||||
{0x80, "Ampere"},
|
||||
{0x86, "Ampere"},
|
||||
{0x89, "AdaLovelace"},
|
||||
{0x90, "Hopper"},
|
||||
{-1, "Graphics Device"}};
|
||||
|
||||
int index = 0;
|
||||
|
|
|
@ -418,39 +418,24 @@ end subroutine Get_Mem_Dev
|
|||
#if defined HAVE_GPU
|
||||
|
||||
! Closing YAKL and Kokkos if opened
|
||||
if (gpu_option==ABI_GPU_KOKKOS .or. gpu_option==ABI_GPU_LEGACY) then
|
||||
if (gpu_option==ABI_GPU_KOKKOS) then
|
||||
#ifdef HAVE_YAKL
|
||||
call gator_finalize()
|
||||
write(std_out,*)'yakl gator finalized'
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_KOKKOS
|
||||
! finalize kokkos
|
||||
call kokkos_finalize()
|
||||
write(std_out,*)'kokkos finalized'
|
||||
#endif
|
||||
!kokkos_finalize already reset GPU context
|
||||
!if (gpu_option/=ABI_GPU_KOKKOS) call unset_dev()
|
||||
end if
|
||||
|
||||
! kokkos_finalize already reset GPU context
|
||||
!if (gpu_option/=ABI_GPU_KOKKOS) call unset_dev()
|
||||
|
||||
! Closing YAKL and Kokkos if opened
|
||||
if (gpu_option==ABI_GPU_KOKKOS .or. gpu_option==ABI_GPU_LEGACY) then
|
||||
#ifdef HAVE_YAKL
|
||||
call gator_finalize()
|
||||
write(std_out,*)'yakl gator finalized'
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_KOKKOS
|
||||
! finalize kokkos
|
||||
call kokkos_finalize()
|
||||
write(std_out,*)'kokkos finalized'
|
||||
#endif
|
||||
if (gpu_option==ABI_GPU_LEGACY) then
|
||||
call unset_dev()
|
||||
end if
|
||||
|
||||
! kokkos_finalize already reset GPU context
|
||||
!if (gpu_option/=ABI_GPU_KOKKOS) call unset_dev()
|
||||
|
||||
#endif
|
||||
end subroutine unsetdevice_cuda
|
||||
!!***
|
||||
|
|
|
@ -583,7 +583,7 @@ subroutine nctk_test_mpiio(print_warning)
|
|||
|
||||
!FIXME nf90create fails when using NVHPC
|
||||
! This might be due to my environment, maybe not, need to investigate this...
|
||||
#ifndef FC_NVHPC
|
||||
!!#ifndef FC_NVHPC
|
||||
#ifdef HAVE_NETCDF_MPI
|
||||
if (xmpi_comm_rank(xmpi_world) == master) then
|
||||
! Try to open a file with hdf5.
|
||||
|
@ -619,7 +619,7 @@ subroutine nctk_test_mpiio(print_warning)
|
|||
ABI_WARNING(msg)
|
||||
end if
|
||||
#endif
|
||||
#endif
|
||||
!!#endif
|
||||
|
||||
#ifdef HAVE_NETCDF_DEFAULT
|
||||
if (.not. nctk_has_mpiio) then
|
||||
|
|
|
@ -5002,7 +5002,7 @@ end subroutine abi_gpu_xcopy_2z
|
|||
!! b
|
||||
!!
|
||||
!! SIDE EFFECTS
|
||||
!! WARNING! : this routine is a dummy one when HAVE_GPU_CUDA is not enabled
|
||||
!! WARNING! : this routine is a dummy one when HAVE_GPU is not enabled
|
||||
!! the correct one is in 17_toolbox/gpu_linalg.cu
|
||||
!!
|
||||
!! SOURCE
|
||||
|
|
|
@ -204,11 +204,6 @@ end subroutine elpa_func_uninit
|
|||
!! INPUTS
|
||||
!! [blacs_ctx]= -- optional -- Blacs context
|
||||
!! [gpu]= -- optional -- Flag (0 or 1): use GPU version (currently only NVidia)
|
||||
!! na=Order of matrix A
|
||||
!! nblk=Blocksize of cyclic distribution, must be the same in both directions!
|
||||
!! local_nrows=Leading dimension of A
|
||||
!! local_ncols=Local columns of matrixes A and Q (eigenvectors)
|
||||
!! nev=Number of eigenvalues needed.
|
||||
!!
|
||||
!! SIDE EFFECTS
|
||||
!! elpa_hdl(type<elpa_hdl_t>)= ELPA handle
|
||||
|
@ -223,6 +218,7 @@ subroutine elpa_func_allocate(elpa_hdl,gpu,blacs_ctx)
|
|||
|
||||
!Local variables-------------------------------
|
||||
integer :: err,l_gpu,l_blacs_ctx
|
||||
logical :: gpu_debug_mode=.false.
|
||||
character(len=10) :: varname
|
||||
|
||||
! *********************************************************************
|
||||
|
@ -261,49 +257,30 @@ subroutine elpa_func_allocate(elpa_hdl,gpu,blacs_ctx)
|
|||
ABI_ERROR("You seem to use an old version of ELPA ( < 2021.x ) which only supports NVIDIA GPUs.")
|
||||
#endif
|
||||
end if
|
||||
|
||||
call elpa_func_error_handler(err_code=err,err_msg='Error when enabling GPU on ELPA')
|
||||
if (err==ELPA_OK) call elpa_hdl%elpa%set("debug",1,err)
|
||||
call elpa_func_error_handler(err_code=err,err_msg='Error when enabling debug on ELPA')
|
||||
end if
|
||||
#else
|
||||
if (err==0.and.l_gpu==1) elpa_hdl%gpu=l_gpu
|
||||
#endif
|
||||
|
||||
call elpa_func_error_handler(err_code=err,err_varname=varname)
|
||||
|
||||
call elpa_func_error_handler(err_code=err,err_msg='Error when enabling GPU on ELPA')
|
||||
|
||||
!if (err==ELPA_OK) call elpa_hdl%elpa%set("debug",1,err)
|
||||
!call elpa_func_error_handler(err_code=err,err_msg='Error when enabling debug on ELPA')
|
||||
if (gpu_debug_mode) then
|
||||
if (err==ELPA_OK) call elpa_hdl%elpa%set("debug",1,err)
|
||||
call elpa_func_error_handler(err_code=err,err_msg='Error when enabling debug on ELPA')
|
||||
end if
|
||||
|
||||
end if
|
||||
#else
|
||||
if (err==0.and.l_gpu==1) elpa_hdl%gpu=l_gpu
|
||||
if (err==0.and.l_gpu==1) then
|
||||
elpa_hdl%gpu=l_gpu
|
||||
if (gpu_debug_mode) elpa_hdl%debug=1
|
||||
end if
|
||||
#endif
|
||||
|
||||
call elpa_func_error_handler(err_code=err,err_varname=varname)
|
||||
|
||||
if (present(blacs_ctx)) then
|
||||
if (err==ELPA_OK) call elpa_hdl%elpa%set("blacs_context",int(blacs_ctx,kind=c_int),err)
|
||||
call elpa_func_error_handler(err_code=err,err_varname=varname)
|
||||
end if
|
||||
|
||||
elpa_hdl%is_allocated=.true.
|
||||
|
||||
! Setting matrix size
|
||||
call elpa_func_set_matrix(elpa_hdl,na,nblk,local_nrows,local_ncols,nev)
|
||||
|
||||
if (present(blacs_ctx)) then
|
||||
if (err==ELPA_OK) call elpa_hdl%elpa%set("blacs_context",int(blacs_ctx,kind=c_int),err)
|
||||
end if
|
||||
|
||||
! Proper ELPA setup
|
||||
err = elpa_hdl%elpa%setup()
|
||||
call elpa_func_error_handler(err_code=err,err_msg='Error during ELPA setup')
|
||||
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxEndRange()
|
||||
#endif
|
||||
|
||||
end subroutine elpa_func_allocate
|
||||
!!***
|
||||
|
||||
|
@ -453,6 +430,12 @@ subroutine elpa_func_get_communicators(elpa_hdl,mpi_comm_parent,process_row,proc
|
|||
varname='process_col'
|
||||
call elpa_hdl%elpa%set(trim(varname),process_col,err)
|
||||
end if
|
||||
if (err==ELPA_OK) then
|
||||
varname=''
|
||||
err = elpa_hdl%elpa%setup()
|
||||
call elpa_func_error_handler(err_code=err,err_msg='Error during ELPA setup')
|
||||
endif
|
||||
|
||||
#else
|
||||
elpa_hdl%mpi_comm_parent=mpi_comm_parent
|
||||
elpa_hdl%process_row=process_row
|
||||
|
@ -467,10 +450,13 @@ subroutine elpa_func_get_communicators(elpa_hdl,mpi_comm_parent,process_row,proc
|
|||
!ELPA-LEGACY-2017
|
||||
err=elpa_get_communicators(mpi_comm_parent,process_row,process_col,elpa_hdl%elpa_comm_rows,elpa_hdl%elpa_comm_cols)
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
call elpa_func_error_handler(err_code=err,err_msg='Error in elpa_get_communicators',err_varname=varname)
|
||||
|
||||
elpa_hdl%is_allocated=.true.
|
||||
|
||||
end subroutine elpa_func_get_communicators
|
||||
!!***
|
||||
|
||||
|
|
|
@ -2687,7 +2687,7 @@ subroutine compute_eigen_problem(processor, matrix, results, eigen, comm, istwf_
|
|||
|
||||
call elpa_func_allocate(elpa_hdl,gpu=use_gpu_elpa_)
|
||||
call elpa_func_set_matrix(elpa_hdl,matrix%sizeb_global(1),matrix%sizeb_blocs(1),nev__,&
|
||||
& matrix%sizeb_local(1),matrix%sizeb_local(2),nev__,gpu=use_gpu)
|
||||
& matrix%sizeb_local(1),matrix%sizeb_local(2))
|
||||
call elpa_func_get_communicators(elpa_hdl,processor%comm,processor%coords(1),processor%coords(2))
|
||||
|
||||
if (istwf_k/=2) then
|
||||
|
@ -2912,6 +2912,10 @@ subroutine solve_gevp_complex(na,nev,na_rows,na_cols,nblk,a,b,ev,z,tmp1,tmp2, &
|
|||
if (present(use_gpu_elpa)) use_gpu_elpa_=use_gpu_elpa
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxStartRange("solve_gevp_complex",12)
|
||||
#endif
|
||||
|
||||
! Allocate ELPA handle
|
||||
call elpa_func_allocate(elpa_hdl,blacs_ctx=sc_desc(CTXT_),gpu=use_gpu_elpa_)
|
||||
call elpa_func_set_matrix(elpa_hdl,na,nblk,nev,na_rows,na_cols)
|
||||
|
@ -2920,6 +2924,7 @@ subroutine solve_gevp_complex(na,nev,na_rows,na_cols,nblk,a,b,ev,z,tmp1,tmp2, &
|
|||
call elpa_func_solve_gevp_2stage(elpa_hdl,a,b,z,ev,nev)
|
||||
|
||||
call elpa_func_deallocate(elpa_hdl)
|
||||
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxEndRange()
|
||||
#endif
|
||||
|
@ -2957,6 +2962,10 @@ subroutine solve_gevp_real(na,nev,na_rows,na_cols,nblk,a,b,ev,z,tmp1,tmp2, &
|
|||
if (present(use_gpu_elpa)) use_gpu_elpa_=use_gpu_elpa
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxStartRange("solve_gevp_real",12)
|
||||
#endif
|
||||
|
||||
! Allocate ELPA handle
|
||||
call elpa_func_allocate(elpa_hdl,blacs_ctx=sc_desc(CTXT_),gpu=use_gpu_elpa_)
|
||||
call elpa_func_set_matrix(elpa_hdl,na,nblk,nev,na_rows,na_cols)
|
||||
|
@ -2999,6 +3008,10 @@ subroutine solve_gevp_real(na,nev,na_rows,na_cols,nblk,a,b,ev,z,tmp1,tmp2, &
|
|||
|
||||
call elpa_func_deallocate(elpa_hdl)
|
||||
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxEndRange()
|
||||
#endif
|
||||
|
||||
end subroutine solve_gevp_real
|
||||
!!***
|
||||
#endif
|
||||
|
@ -3050,6 +3063,7 @@ subroutine compute_generalized_eigen_problem(processor,matrix1,matrix2,results,e
|
|||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxStartRange("slk_compute_generalized_eigen", 10)
|
||||
#endif
|
||||
|
||||
nev__ = matrix1%sizeb_global(2); if (present(nev)) nev__ = nev
|
||||
use_gpu_elpa__ = 0
|
||||
#ifdef HAVE_LINALG_ELPA
|
||||
|
@ -3082,6 +3096,7 @@ subroutine compute_generalized_eigen_problem(processor,matrix1,matrix2,results,e
|
|||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxEndRange()
|
||||
#endif
|
||||
|
||||
#else
|
||||
!Arguments ------------------------------------
|
||||
class(processor_scalapack),intent(in) :: processor
|
||||
|
@ -3292,6 +3307,10 @@ subroutine compute_eigen1(comm,processor,cplex,nbli_global,nbco_global,matrix,ve
|
|||
if (present(use_gpu_elpa)) use_gpu_elpa_=use_gpu_elpa
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxStartRange("slk_compute_eigen1", 7)
|
||||
#endif
|
||||
|
||||
! ================================
|
||||
! INITIALISATION SCALAPACK MATRIX
|
||||
! ================================
|
||||
|
@ -3358,6 +3377,10 @@ subroutine compute_eigen1(comm,processor,cplex,nbli_global,nbco_global,matrix,ve
|
|||
ABI_SFREE(z_tmp_evec)
|
||||
ABI_SFREE(r_tmp_evec)
|
||||
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxEndRange()
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_LINALG_ELPA
|
||||
ABI_UNUSED(use_gpu_elpa)
|
||||
#endif
|
||||
|
@ -3423,6 +3446,10 @@ subroutine compute_eigen2(comm,processor,cplex,nbli_global,nbco_global,matrix1,m
|
|||
if (present(use_gpu_elpa)) use_gpu_elpa_=use_gpu_elpa
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxStartRange("slk_compute_eigen2", 7)
|
||||
#endif
|
||||
|
||||
! ================================
|
||||
! INITIALISATION SCALAPACK MATRIX
|
||||
! ================================
|
||||
|
@ -3495,13 +3522,14 @@ subroutine compute_eigen2(comm,processor,cplex,nbli_global,nbco_global,matrix1,m
|
|||
call sca_matrix2%free()
|
||||
call sca_matrix3%free()
|
||||
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxEndRange()
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_LINALG_ELPA
|
||||
ABI_UNUSED(use_gpu_elpa)
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxEndRange()
|
||||
#endif
|
||||
end subroutine compute_eigen2
|
||||
!!***
|
||||
|
||||
|
|
|
@ -29,10 +29,14 @@ cpp_options:
|
|||
- DEV_YP_DEBUG_PSP
|
||||
- DEV_YP_VDWXC
|
||||
- FC_ABSOFT
|
||||
- FC_CRAY
|
||||
- FC_FLANG
|
||||
- FC_GNU
|
||||
- FC_IBM
|
||||
- FC_INTEL
|
||||
- FC_LLVM
|
||||
- FC_NAG
|
||||
- FC_NVHPC
|
||||
- FC_PGI
|
||||
- FFT_PRECISION
|
||||
- GPU_FOUR_HEADER_H
|
||||
|
@ -88,6 +92,7 @@ cpp_options:
|
|||
- HAVE_GPU_CUDA_DP
|
||||
- HAVE_GPU_CUDA_SP
|
||||
- HAVE_GPU_CUDA_TM
|
||||
- HAVE_GPU_HIP
|
||||
- HAVE_GPU_MPI
|
||||
- HAVE_GPU_SERIAL
|
||||
- HAVE_GW_DPC
|
||||
|
@ -154,6 +159,8 @@ cpp_options:
|
|||
- HAVE_NUMPY
|
||||
- HAVE_OMP_COLLAPSE
|
||||
- HAVE_OPENMP
|
||||
- HAVE_OPENMP_OFFLOAD
|
||||
- HAVE_OPENMP_OFFLOAD_DATASTRUCTURE
|
||||
- HAVE_OS_LINUX
|
||||
- HAVE_OS_MACOSX
|
||||
- HAVE_OS_WINDOWS
|
||||
|
|
|
@ -124,7 +124,6 @@ type, public :: dataset_type
|
|||
integer :: diismemory
|
||||
integer :: dipdip = 1
|
||||
integer :: dipquad = 1
|
||||
integer :: distribute_gemm_nonlop = 0
|
||||
integer :: dmatpuopt
|
||||
integer :: dmatudiag
|
||||
integer :: dmft_dc
|
||||
|
@ -202,7 +201,6 @@ type, public :: dataset_type
|
|||
integer :: ga_algor
|
||||
integer :: ga_fitness
|
||||
integer :: ga_n_rules
|
||||
integer :: gemm_nonlop_split_size = 1
|
||||
integer :: getcell = 0
|
||||
integer :: getddb = 0
|
||||
integer :: getdvdb = 0
|
||||
|
@ -623,7 +621,6 @@ type, public :: dataset_type
|
|||
integer :: tl_nprccg
|
||||
!U
|
||||
integer :: ucrpa
|
||||
integer :: use_gpu_openmp_threads
|
||||
integer :: usedmatpu
|
||||
integer :: usedmft
|
||||
integer :: useexexch
|
||||
|
@ -1445,7 +1442,6 @@ type(dataset_type) function dtset_copy(dtin) result(dtout)
|
|||
dtout%delayperm = dtin%delayperm
|
||||
dtout%diismemory = dtin%diismemory
|
||||
dtout%dipquad = dtin%dipquad
|
||||
dtout%distribute_gemm_nonlop = dtin%distribute_gemm_nonlop
|
||||
dtout%dmatpuopt = dtin%dmatpuopt
|
||||
dtout%dmatudiag = dtin%dmatudiag
|
||||
dtout%dmft_dc = dtin%dmft_dc
|
||||
|
@ -1578,7 +1574,6 @@ type(dataset_type) function dtset_copy(dtin) result(dtout)
|
|||
dtout%ga_algor = dtin%ga_algor
|
||||
dtout%ga_fitness = dtin%ga_fitness
|
||||
dtout%ga_n_rules = dtin%ga_n_rules
|
||||
dtout%gemm_nonlop_split_size = dtin%gemm_nonlop_split_size
|
||||
dtout%getbseig = dtin%getbseig
|
||||
dtout%getbsreso = dtin%getbsreso
|
||||
dtout%getbscoup = dtin%getbscoup
|
||||
|
@ -1994,7 +1989,6 @@ type(dataset_type) function dtset_copy(dtin) result(dtout)
|
|||
dtout%tim1rev = dtin%tim1rev
|
||||
dtout%timopt = dtin%timopt
|
||||
dtout%use_gemm_nonlop = dtin%use_gemm_nonlop
|
||||
dtout%use_gpu_openmp_threads = dtin%use_gpu_openmp_threads
|
||||
dtout%useextfpmd = dtin%useextfpmd
|
||||
dtout%use_yaml = dtin%use_yaml ! This variable activates the Yaml output for testing purposes
|
||||
! It will be removed when Yaml output enters production.
|
||||
|
@ -3317,7 +3311,7 @@ subroutine chkvars(string)
|
|||
list_vars=trim(list_vars)//' delayperm densfor_pred densty dfield'
|
||||
list_vars=trim(list_vars)//' dfpt_sciss diecut diegap dielam dielng diemac'
|
||||
list_vars=trim(list_vars)//' diemix diemixmag diismemory'
|
||||
list_vars=trim(list_vars)//' dilatmx dipdip dipquad dipdip_prt dipdip_range distribute_gemm_nonlop'
|
||||
list_vars=trim(list_vars)//' dilatmx dipdip dipquad dipdip_prt dipdip_range'
|
||||
list_vars=trim(list_vars)//' dmatpawu dmatpuopt dmatudiag'
|
||||
list_vars=trim(list_vars)//' dmftbandi dmftbandf dmftctqmc_basis'
|
||||
list_vars=trim(list_vars)//' dmftctqmc_check dmftctqmc_correl dmftctqmc_gmove'
|
||||
|
@ -3365,7 +3359,7 @@ subroutine chkvars(string)
|
|||
list_vars=trim(list_vars)//' f4of2_sla f6of2_sla'
|
||||
!G
|
||||
list_vars=trim(list_vars)//' ga_algor ga_fitness ga_n_rules ga_opt_percent ga_rules'
|
||||
list_vars=trim(list_vars)//' gemm_nonlop_split_size genafm getbscoup getbseig getbsreso getcell'
|
||||
list_vars=trim(list_vars)//' genafm getbscoup getbseig getbsreso getcell'
|
||||
list_vars=trim(list_vars)//' getddb getddb_filepath getden_filepath getddk'
|
||||
list_vars=trim(list_vars)//' getdelfd getdkdk getdkde getden getkden getdvdb getdvdb_filepath'
|
||||
list_vars=trim(list_vars)//' getefmas getkerange_filepath getgam_eig2nkq'
|
||||
|
@ -3539,7 +3533,6 @@ subroutine chkvars(string)
|
|||
list_vars=trim(list_vars)//' userra userrb userrc userrd userre'
|
||||
list_vars=trim(list_vars)//' usewvl usexcnhat useylm use_gemm_nonlop'
|
||||
list_vars=trim(list_vars)//' use_slk useextfpmd use_yaml'
|
||||
list_vars=trim(list_vars)//' use_slk useextfpmd use_yaml'
|
||||
list_vars=trim(list_vars)//' use_oldchi'
|
||||
!V
|
||||
list_vars=trim(list_vars)//' vaclst vacnum vacuum vacwidth vcutgeo'
|
||||
|
|
|
@ -605,7 +605,7 @@ module m_xgTransposer
|
|||
call nvtxStartRange("MPI_AllToAllV", 8)
|
||||
#endif
|
||||
|
||||
if( xgTransposer%gou_option == ABI_GPU_KOKKOS) then
|
||||
if( xgTransposer%gpu_option == ABI_GPU_KOKKOS) then
|
||||
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_KOKKOS) && defined(HAVE_YAKL)
|
||||
call timab(tim_all2allv,1,tsec)
|
||||
|
@ -621,7 +621,6 @@ module m_xgTransposer
|
|||
recvbuf(:,:) = recvbuf_mpi(:,:)
|
||||
|
||||
ABI_FREE(recvbuf_mpi)
|
||||
|
||||
#endif
|
||||
|
||||
else
|
||||
|
|
|
@ -205,13 +205,12 @@ subroutine ompgpu_fourwf(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,ist
|
|||
|
||||
cfft_size = 2*n1*n2*n3*ndat
|
||||
|
||||
#ifdef HAVE_GPU_CUDA
|
||||
#if defined HAVE_GPU_CUDA
|
||||
byte_count=sizeof(work_gpu)
|
||||
!$OMP TARGET DATA USE_DEVICE_PTR(work_gpu)
|
||||
call gpu_memset(c_loc(work_gpu), 0, byte_count)
|
||||
!$OMP END TARGET DATA
|
||||
#endif
|
||||
#ifdef HAVE_GPU_HIP
|
||||
#elif defined HAVE_GPU_HIP
|
||||
!$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) PRIVATE(i1,i2,i3) MAP(to:work_gpu)
|
||||
do i3=1,n3*ndat
|
||||
do i2=1,n2
|
||||
|
@ -266,12 +265,11 @@ subroutine ompgpu_fourwf(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,ist
|
|||
i1=kg_kin(1,ipw); if(i1<0)i1=i1+n1;
|
||||
i2=kg_kin(2,ipw); if(i2<0)i2=i2+n2;
|
||||
i3=kg_kin(3,ipw); if(i3<0)i3=i3+n3;
|
||||
#ifdef HAVE_GPU_CUDA
|
||||
#if defined HAVE_GPU_CUDA
|
||||
i1inv = modulo(shift_inv1 - i1, n1) + 1
|
||||
i2inv = modulo(shift_inv2 - i2, n2) + 1
|
||||
i3inv = modulo(shift_inv3 - i3, n3) + 1
|
||||
#endif
|
||||
#ifdef HAVE_GPU_HIP
|
||||
#elif defined HAVE_GPU_HIP
|
||||
i1inv = (shift_inv1-i1) - ( ((shift_inv1-i1)/n1) * n1 ) + 1
|
||||
i2inv = (shift_inv2-i2) - ( ((shift_inv2-i2)/n2) * n2 ) + 1
|
||||
i3inv = (shift_inv3-i3) - ( ((shift_inv3-i3)/n3) * n3 ) + 1
|
||||
|
|
|
@ -1513,7 +1513,7 @@ subroutine timana(mpi_enreg,natom,nband,ndtset,nfft,nkpt,npwtot,nsppol,timopt)
|
|||
|
||||
percent_limit=0.5_dp
|
||||
if (timopt<0) percent_limit=0.0001_dp
|
||||
if (timopt<0) percent_limit=tol12
|
||||
!if (timopt<0) percent_limit=tol12
|
||||
|
||||
!In case there is parallelism, report times for node 0
|
||||
!if (me==0 .and. nproc>1) then
|
||||
|
@ -1591,10 +1591,10 @@ subroutine timana(mpi_enreg,natom,nband,ndtset,nfft,nkpt,npwtot,nsppol,timopt)
|
|||
end if
|
||||
|
||||
!Now, gather all information
|
||||
!call xmpi_sum(times,spaceworld,ierr)
|
||||
!call xmpi_sum(ncount,spaceworld,ierr)
|
||||
!call xmpi_sum(ftimes,spaceworld,ierr)
|
||||
!call xmpi_sum(nflops,spaceworld,ierr)
|
||||
call xmpi_sum(times,spaceworld,ierr)
|
||||
call xmpi_sum(ncount,spaceworld,ierr)
|
||||
call xmpi_sum(ftimes,spaceworld,ierr)
|
||||
call xmpi_sum(nflops,spaceworld,ierr)
|
||||
|
||||
if (me==0) then ! Only the world master writes
|
||||
|
||||
|
|
|
@ -2295,7 +2295,6 @@ subroutine indefo(dtsets, ndtset_alloc, nprocs)
|
|||
dtsets(idtset)%dielam=half
|
||||
dtsets(idtset)%diismemory=8
|
||||
dtsets(idtset)%dilatmx=one
|
||||
dtsets(idtset)%distribute_gemm_nonlop=0
|
||||
dtsets(idtset)%dmatpuopt=2
|
||||
if (size(dtsets(idtset)%dmatpawu,4)>0) dtsets(idtset)%dmatpawu=-10._dp
|
||||
dtsets(idtset)%dmatudiag=0
|
||||
|
|
|
@ -257,10 +257,10 @@ SUBROUTINE BathOperatoroffdiag_init(op, flavors, samples, beta, iTech,opt_nondia
|
|||
FREEIF(op%F)
|
||||
MALLOC(op%F,(1:op%sizeHybrid+1,1:flavors,1:flavors))
|
||||
DT_FREEIF(op%tails)
|
||||
DT_MALLOC(op%tails, (1:op%flavors))
|
||||
DT_MALLOC(op%tails,(1:op%flavors))
|
||||
op%tails=0
|
||||
DT_FREEIF(op%Fshift)
|
||||
DT_MALLOC(op%Fshift, (1:op%flavors+1))
|
||||
DT_MALLOC(op%Fshift,(1:op%flavors+1))
|
||||
op%Fshift=0
|
||||
|
||||
CALL Vector_init(op%R,100*op%flavors)
|
||||
|
|
|
@ -643,7 +643,7 @@ SUBROUTINE Ctqmc_allocateAll(this)
|
|||
this%measDE = 0.d0
|
||||
|
||||
FREEIF(this%mu)
|
||||
MALLOC(this%mu, (1:flavors) )
|
||||
MALLOC(this%mu,(1:flavors) )
|
||||
this%mu = 0.d0
|
||||
END SUBROUTINE Ctqmc_allocateAll
|
||||
!!***
|
||||
|
|
|
@ -665,7 +665,7 @@ SUBROUTINE Ctqmcoffdiag_allocateAll(op)
|
|||
op%measDE = 0.d0
|
||||
|
||||
FREEIF(op%mu)
|
||||
MALLOC(op%mu, (1:flavors) )
|
||||
MALLOC(op%mu,(1:flavors) )
|
||||
op%mu = 0.d0
|
||||
FREEIF(op%hybri_limit)
|
||||
MALLOC(op%hybri_limit, (flavors,flavors) )
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
!! which leads to excellent CPU efficiency and OpenMP scalability.
|
||||
!!
|
||||
!! COPYRIGHT
|
||||
!! Copyright (C) 2014-2022 ABINIT group (AL)
|
||||
!! Copyright (C) 2014-2022 ABINIT group (MS)
|
||||
!! This file is distributed under the terms of the
|
||||
!! GNU General Public License, see ~abinit/COPYING
|
||||
!! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
|
@ -360,7 +360,7 @@ contains
|
|||
end if
|
||||
|
||||
if(allocated(temp_realvec_r)) then
|
||||
!$OMP TARGET EXIT DATA MAP(delete:temp_realvec_r,temp_realvec_i)
|
||||
! Drop both scratch arrays from the device data environment before they are freed on the host.
!$OMP TARGET EXIT DATA MAP(delete:temp_realvec_r,temp_realvec_i)
|
||||
ABI_FREE(temp_realvec_r)
|
||||
ABI_FREE(temp_realvec_i)
|
||||
end if
|
||||
|
@ -800,7 +800,7 @@ contains
|
|||
real(dp),intent(inout),target :: vectin(2,npwin*nspinor*ndat)
|
||||
real(dp),intent(inout) :: enlout(nnlout*ndat)
|
||||
real(dp),intent(out),target :: svectout(:,:)
|
||||
real(dp),intent(inout),target :: vectout(:,:) !vz_i
|
||||
real(dp),intent(inout),target :: vectout(:,:)
|
||||
real(dp),intent(inout),optional, ABI_CONTIGUOUS target :: vectproj(:,:,:)
|
||||
type(pawcprj_type),intent(inout) :: cprjin(natom,nspinor*((cpopt+5)/5)*ndat)
|
||||
|
||||
|
@ -826,9 +826,7 @@ contains
|
|||
character(len=500) :: msg
|
||||
integer(C_SIZE_T) :: byte_count
|
||||
#ifdef HAVE_GPU_HIP
|
||||
type(c_ptr) :: vectin_amdcopy
|
||||
type(c_ptr) :: vectout_amdcopy
|
||||
type(c_ptr) :: svectout_amdcopy
|
||||
type(c_ptr) :: vectin_amdcopy,vectout_amdcopy,svectout_amdcopy
|
||||
#endif
|
||||
|
||||
! *************************************************************************
|
||||
|
|
|
@ -191,7 +191,6 @@ module m_hamiltonian
|
|||
! Governs the choice of the GPU implementation:
|
||||
! = 0 ==> do not use GPU
|
||||
! > 0 ==> see defs_basis.F90 to have the list of possible GPU implementations
|
||||
! = 666 ==> use openMP GPU implementation of hamiltonian operators
|
||||
|
||||
integer :: usecprj
|
||||
! usecprj= 1 if cprj projected WF are stored in memory
|
||||
|
|
|
@ -759,6 +759,7 @@ has_fock=.false.
|
|||
#ifndef HAVE_GPU_HIP
|
||||
!$OMP TARGET EXIT DATA MAP(delete:work)
|
||||
#endif
|
||||
|
||||
end if ! type_calc
|
||||
ABI_NVTX_END_RANGE()
|
||||
|
||||
|
@ -768,6 +769,7 @@ has_fock=.false.
|
|||
!============================================================
|
||||
! Application of the non-local potential and the Fock potential
|
||||
!============================================================
|
||||
|
||||
ABI_NVTX_START_RANGE(NVTX_GETGHC_NLOCPOT)
|
||||
if (type_calc==0 .or. type_calc==2) then
|
||||
signs=2 ; choice=1 ; nnlout=1 ; idir=0 ; tim_nonlop=1
|
||||
|
|
|
@ -1467,8 +1467,7 @@ subroutine solve_inner_ompgpu(invovl, ham, cplx, mpi_enreg, proj, ndat, sm1proj,
|
|||
integer :: additional_steps_to_take,idat,iproj,icplx
|
||||
integer :: Ptsize(3)
|
||||
#ifdef HAVE_GPU_HIP
|
||||
type(c_ptr) :: sm1proj_amdcopy
|
||||
type(c_ptr) :: PtPsm1proj_amdcopy
|
||||
type(c_ptr) :: sm1proj_amdcopy,PtPsm1proj_amdcopy
|
||||
#endif
|
||||
|
||||
! *************************************************************************
|
||||
|
|
|
@ -53,7 +53,7 @@ module m_prep_kgb
|
|||
use m_ompgpu_fourwf
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||
use m_nvtx
|
||||
#endif
|
||||
|
||||
|
@ -295,7 +295,7 @@ subroutine prep_getghc(cwavef, gs_hamk, gvnlxc, gwavef, swavef, lambda, blocksiz
|
|||
|
||||
if(do_transpose) then
|
||||
call timab(545,3,tsec)
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxStartRange("MPI_AllToAllV", 8)
|
||||
#endif
|
||||
if ( ((.not.flag_inv_sym) .and. bandpp==1 .and. mpi_enreg%paral_spinor==0 .and. my_nspinor==2 ).or. &
|
||||
|
@ -306,7 +306,7 @@ subroutine prep_getghc(cwavef, gs_hamk, gvnlxc, gwavef, swavef, lambda, blocksiz
|
|||
call xmpi_alltoallv(cwavef,sendcountsloc,sdisplsloc,cwavef_alltoall2,&
|
||||
& recvcountsloc,rdisplsloc,spaceComm,ier)
|
||||
end if
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxEndRange()
|
||||
#endif
|
||||
call timab(545,2,tsec)
|
||||
|
@ -527,56 +527,56 @@ subroutine prep_getghc(cwavef, gs_hamk, gvnlxc, gwavef, swavef, lambda, blocksiz
|
|||
if ( ((.not.flag_inv_sym) .and. bandpp==1 .and. mpi_enreg%paral_spinor==0 .and. my_nspinor==2 ).or. &
|
||||
& ((.not.flag_inv_sym) .and. bandpp>1) .or. flag_inv_sym ) then
|
||||
if (sij_opt==1) then
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxStartRange("MPI_AllToAllV", 8)
|
||||
#endif
|
||||
call xmpi_alltoallv(swavef_alltoall1,recvcountsloc,rdisplsloc,swavef,&
|
||||
& sendcountsloc,sdisplsloc,spaceComm,ier)
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxEndRange()
|
||||
#endif
|
||||
end if
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxStartRange("MPI_AllToAllV", 8)
|
||||
#endif
|
||||
if (.not.local_gvnlxc) call xmpi_alltoallv(gvnlxc_alltoall1,recvcountsloc,rdisplsloc,gvnlxc,&
|
||||
& sendcountsloc,sdisplsloc,spaceComm,ier)
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxEndRange()
|
||||
#endif
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxStartRange("MPI_AllToAllV", 8)
|
||||
#endif
|
||||
call xmpi_alltoallv(gwavef_alltoall1,recvcountsloc,rdisplsloc,gwavef,&
|
||||
& sendcountsloc,sdisplsloc,spaceComm,ier)
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxEndRange()
|
||||
#endif
|
||||
else
|
||||
if (sij_opt==1) then
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxStartRange("MPI_AllToAllV", 8)
|
||||
#endif
|
||||
call xmpi_alltoallv(swavef_alltoall2,recvcountsloc,rdisplsloc,swavef,&
|
||||
& sendcountsloc,sdisplsloc,spaceComm,ier)
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxEndRange()
|
||||
#endif
|
||||
end if
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxStartRange("MPI_AllToAllV", 8)
|
||||
#endif
|
||||
if (.not.local_gvnlxc) call xmpi_alltoallv(gvnlxc_alltoall2,recvcountsloc,rdisplsloc,gvnlxc,&
|
||||
& sendcountsloc,sdisplsloc,spaceComm,ier)
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxEndRange()
|
||||
#endif
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxStartRange("MPI_AllToAllV", 8)
|
||||
#endif
|
||||
call xmpi_alltoallv(gwavef_alltoall2,recvcountsloc,rdisplsloc,gwavef,&
|
||||
& sendcountsloc,sdisplsloc,spaceComm,ier)
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxEndRange()
|
||||
#endif
|
||||
end if
|
||||
|
@ -860,7 +860,7 @@ subroutine prep_nonlop(choice,cpopt,cwaveprj,enlout_block,hamk,idir,lambdablock,
|
|||
|
||||
if(do_transpose) then
|
||||
call timab(581,1,tsec)
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxStartRange("MPI_AllToAllV", 8)
|
||||
#endif
|
||||
if (bandpp/=1 .or. (bandpp==1 .and. mpi_enreg%paral_spinor==0.and.nspinortot==2)) then
|
||||
|
@ -892,7 +892,7 @@ subroutine prep_nonlop(choice,cpopt,cwaveprj,enlout_block,hamk,idir,lambdablock,
|
|||
call xmpi_alltoallv(cwavef,sendcountsloc,sdisplsloc,cwavef_alltoall2,&
|
||||
& recvcountsloc,rdisplsloc,spaceComm,ier)
|
||||
end if
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxEndRange()
|
||||
#endif
|
||||
call timab(581,2,tsec)
|
||||
|
@ -1000,43 +1000,43 @@ subroutine prep_nonlop(choice,cpopt,cwaveprj,enlout_block,hamk,idir,lambdablock,
|
|||
call timab(581,1,tsec)
|
||||
if(bandpp/=1 .or. (bandpp==1 .and. mpi_enreg%paral_spinor==0.and.nspinortot==2))then
|
||||
if (paw_opt/=3) then
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxStartRange("MPI_AllToAllV", 8)
|
||||
#endif
|
||||
call xmpi_alltoallv(gvnlc_alltoall1,recvcountsloc,rdisplsloc,gvnlc,&
|
||||
& sendcountsloc,sdisplsloc,spaceComm,ier)
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxEndRange()
|
||||
#endif
|
||||
end if
|
||||
if (paw_opt==3.or.paw_opt==4) then
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxStartRange("MPI_AllToAllV", 8)
|
||||
#endif
|
||||
call xmpi_alltoallv(gsc_alltoall1,recvcountsloc,rdisplsloc,gsc,&
|
||||
& sendcountsloc,sdisplsloc,spaceComm,ier)
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxEndRange()
|
||||
#endif
|
||||
end if
|
||||
else
|
||||
if (paw_opt/=3) then
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxStartRange("MPI_AllToAllV", 8)
|
||||
#endif
|
||||
call xmpi_alltoallv(gvnlc_alltoall2,recvcountsloc,rdisplsloc,gvnlc,&
|
||||
& sendcountsloc,sdisplsloc,spaceComm,ier)
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxEndRange()
|
||||
#endif
|
||||
end if
|
||||
if (paw_opt==3.or.paw_opt==4) then
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxStartRange("MPI_AllToAllV", 8)
|
||||
#endif
|
||||
call xmpi_alltoallv(gsc_alltoall2,recvcountsloc,rdisplsloc,gsc,&
|
||||
& sendcountsloc,sdisplsloc,spaceComm,ier)
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxEndRange()
|
||||
#endif
|
||||
end if
|
||||
|
@ -1282,7 +1282,7 @@ subroutine prep_fourwf(rhoaug,blocksize,cwavef,wfraug,iblock,istwf_k,mgfft,&
|
|||
sdisplsloc(:)=sdispls(:)*2
|
||||
|
||||
call timab(547,1,tsec)
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxStartRange("MPI_AllToAllV", 8)
|
||||
#endif
|
||||
#if defined HAVE_GPU && defined HAVE_YAKL
|
||||
|
@ -1305,7 +1305,7 @@ subroutine prep_fourwf(rhoaug,blocksize,cwavef,wfraug,iblock,istwf_k,mgfft,&
|
|||
call xmpi_alltoallv(cwavef,sendcountsloc,sdisplsloc,cwavef_alltoall2,&
|
||||
& recvcountsloc,rdisplsloc,spaceComm,ier)
|
||||
#endif
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||
call nvtxEndRange()
|
||||
#endif
|
||||
call timab(547,2,tsec)
|
||||
|
|
|
@ -975,13 +975,6 @@ subroutine forstrnps(cg,cprj,ecut,ecutsm,effmass_free,eigen,electronpositron,foc
|
|||
gs_hamk%ucvol, gs_hamk%ffnl_k, gs_hamk%ph3d_k, gs_hamk%kpt_k, &
|
||||
gs_hamk%kg_k, gs_hamk%kpg_k, &
|
||||
compute_grad_strain=(stress_needed>0),compute_grad_atom=(optfor>0))
|
||||
!!FIXME signs==1 not handled in CUDA GEMM nonlop
|
||||
!else if ( gpu_option /= ABI_GPU_LEGACY) then
|
||||
! call make_gemm_nonlop_ompgpu(my_ikpt,gs_hamk%npw_fft_k,gs_hamk%lmnmax, &
|
||||
! gs_hamk%ntypat, gs_hamk%indlmn, gs_hamk%nattyp, gs_hamk%istwf_k, &
|
||||
! gs_hamk%ucvol, gs_hamk%ffnl_k, gs_hamk%ph3d_k, gs_hamk%kpt_k, &
|
||||
! gs_hamk%kg_k, gs_hamk%kpg_k, &
|
||||
! compute_grad_strain=(stress_needed>0),compute_grad_atom=(optfor>0))
|
||||
else if ( gpu_option == ABI_GPU_OPENMP) then
|
||||
call make_gemm_nonlop_ompgpu(my_ikpt,gs_hamk%npw_fft_k,gs_hamk%lmnmax, &
|
||||
gs_hamk%ntypat, gs_hamk%indlmn, gs_hamk%nattyp, gs_hamk%istwf_k, &
|
||||
|
@ -989,15 +982,6 @@ subroutine forstrnps(cg,cprj,ecut,ecutsm,effmass_free,eigen,electronpositron,foc
|
|||
gs_hamk%kg_k, gs_hamk%kpg_k, &
|
||||
compute_grad_strain=(stress_needed>0),compute_grad_atom=(optfor>0))
|
||||
end if
|
||||
else
|
||||
ABI_ERROR("istwfk > 2 is not handled with OpenMP GPU offload mode !")
|
||||
end if
|
||||
call make_gemm_nonlop_ompgpu(my_ikpt,gs_hamk%npw_fft_k,gs_hamk%lmnmax, &
|
||||
gs_hamk%ntypat, gs_hamk%indlmn, gs_hamk%nattyp, gs_hamk%istwf_k, &
|
||||
gs_hamk%ucvol, gs_hamk%ffnl_k, gs_hamk%ph3d_k, gs_hamk%kpt_k, &
|
||||
gs_hamk%kg_k, gs_hamk%kpg_k, &
|
||||
compute_grad_strain=(stress_needed>0),compute_grad_atom=(optfor>0))
|
||||
end if
|
||||
end if
|
||||
|
||||
! Loop over (blocks of) bands; accumulate forces and/or stresses
|
||||
|
|
|
@ -180,6 +180,10 @@ subroutine mkrho(cg,dtset,gprimd,irrzon,kg,mcg,mpi_enreg,npwarr,occ,paw_dmft,phn
|
|||
!arrays
|
||||
integer,allocatable :: gbound(:,:)
|
||||
logical :: locc_test,nspinor1TreatedByThisProc,nspinor2TreatedByThisProc
|
||||
real(dp),allocatable :: occ_diag(:),cwavef_rot(:,:,:,:)
|
||||
#if defined HAVE_GPUL
|
||||
real(dp),allocatable :: weight_t(:) ! only allocated and used when use_gpu_cuda = 1
|
||||
#endif
|
||||
#if defined HAVE_GPU && defined HAVE_YAKL
|
||||
integer(int32),ABI_CONTIGUOUS pointer :: kg_k(:,:) => null()
|
||||
real(real64) :: dummy(2,1) = reshape( (/0.0, 0.0/), shape(dummy))
|
||||
|
|
|
@ -3718,7 +3718,7 @@ subroutine wfd_change_ngfft(Wfd, Cryst, Psps, new_ngfft)
|
|||
|
||||
! Recalculate FFT tables.
|
||||
! Calculate the FFT index of $ R^{-1} (r-\tau) $ used to symmetrize u_Rk.
|
||||
ABI_REMALLOC(Wfd%irottb, (Wfd%nfftot,Cryst%nsym) )
|
||||
ABI_REMALLOC(Wfd%irottb, (Wfd%nfftot,Cryst%nsym))
|
||||
call rotate_FFT_mesh(Cryst%nsym,Cryst%symrel,Cryst%tnons,Wfd%ngfft,Wfd%irottb,iscompatibleFFT)
|
||||
|
||||
if (.not. iscompatibleFFT) then
|
||||
|
|
|
@ -6036,7 +6036,7 @@ subroutine ddb_to_dtset(comm, dtset, filename, psps)
|
|||
ABI_REMALLOC(dtset%spinat, (3,dtset%natom))
|
||||
dtset%spinat(:,:) = ddb_hdr%spinat(1:3,1:ddb_hdr%matom)
|
||||
|
||||
ABI_REMALLOC(dtset%xred_orig, (3,dtset%natom,mxnimage) )
|
||||
ABI_REMALLOC(dtset%xred_orig, (3,dtset%natom,mxnimage))
|
||||
dtset%xred_orig(:,:,1) = ddb_hdr%xred(1:3,1:ddb_hdr%matom)
|
||||
|
||||
ABI_REMALLOC(dtset%ziontypat, (dtset%ntypat))
|
||||
|
|
|
@ -278,12 +278,6 @@ subroutine chebfiwf2(cg,dtset,eig,enl_out,gs_hamk,kinpw,mpi_enreg,&
|
|||
type(pawcprj_type) :: cprj_dum(gs_hamk%natom,1)
|
||||
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_YAKL)
|
||||
! other
|
||||
integer(kind=c_size_t) :: l_pcon_size_bytes
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_YAKL)
|
||||
! other
|
||||
integer(kind=c_size_t) :: l_pcon_size_bytes
|
||||
#endif
|
||||
|
||||
|
@ -581,7 +575,6 @@ subroutine getghc_gsc1(X,AX,BX,transposer)
|
|||
! ABI_MALLOC(l_gvnlxc,(2,blockdim*spacedim))
|
||||
!end if
|
||||
|
||||
|
||||
call multithreaded_getghc(l_cpopt,cg,cprj_dum,ghc,gsc,&
|
||||
l_gs_hamk,l_gvnlxc,eval,l_mpi_enreg,blockdim,l_prtvol,l_sij_opt,l_tim_getghc,0)
|
||||
|
||||
|
@ -590,14 +583,6 @@ subroutine getghc_gsc1(X,AX,BX,transposer)
|
|||
call gpu_device_synchronize()
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_YAKL)
|
||||
!if (chebfi%gpu_option==ABI_GPU_KOKKOS) then
|
||||
call gpu_device_synchronize()
|
||||
!end if
|
||||
#endif
|
||||
|
||||
|
||||
!Scale cg, ghc, gsc
|
||||
if ( l_istwf == 2 ) then
|
||||
call xgBlock_scale(X ,sqrt2,1,gpu_option=l_gs_hamk%gpu_option)
|
||||
|
@ -783,7 +768,6 @@ subroutine getBm1X(X,Bm1X,transposer)
|
|||
!cwaveprj_next is dummy
|
||||
if(gemm_nonlop_use_gemm) then
|
||||
ABI_MALLOC(cwaveprj_next, (1,1))
|
||||
else
|
||||
else
|
||||
ABI_MALLOC(cwaveprj_next, (l_gs_hamk%natom,l_nspinor*blockdim))
|
||||
call pawcprj_alloc(cwaveprj_next,0,l_gs_hamk%dimcprj)
|
||||
|
@ -859,11 +843,11 @@ subroutine getBm1X(X,Bm1X,transposer)
|
|||
end if
|
||||
end if
|
||||
end if
|
||||
|
||||
if (l_paw) then
|
||||
if (l_useria /= 121212) then
|
||||
ABI_FREE(cwaveprj_next)
|
||||
end if
|
||||
ABI_FREE(cwaveprj_next)
|
||||
end if
|
||||
|
||||
ABI_NVTX_END_RANGE()
|
||||
|
||||
end subroutine getBm1X
|
||||
|
|
|
@ -122,7 +122,7 @@ subroutine lobpcgwf2(cg,dtset,eig,occ,enl_out,gs_hamk,isppol,ikpt,inonsc,istep,k
|
|||
|
||||
! Important things for NC
|
||||
integer,parameter :: choice=1, paw_opt=0, signs=1
|
||||
type(pawcprj_type) :: cprj_dum(gs_hamk%natom,1)
|
||||
type(pawcprj_type) :: cprj_dum(1,1)
|
||||
integer :: iband, shift
|
||||
real(dp) :: gsc_dummy(0,0)
|
||||
real(dp), allocatable :: l_gvnlxc(:,:)
|
||||
|
@ -355,7 +355,7 @@ end subroutine lobpcgwf2
|
|||
type(xgBlock_t), intent(inout) :: BX
|
||||
integer :: blockdim
|
||||
integer :: spacedim
|
||||
type(pawcprj_type) :: cprj_dum(l_gs_hamk%natom,1)
|
||||
type(pawcprj_type) :: cprj_dum(1,1)
|
||||
double precision :: dum
|
||||
double precision, parameter :: inv_sqrt2 = 1/sqrt2
|
||||
double precision, pointer :: cg(:,:)
|
||||
|
|
|
@ -1031,15 +1031,6 @@ subroutine vtorho(afford,atindx,atindx1,cg,compch_fft,cprj,cpus,dbl_nnsclo,&
|
|||
gs_hamk%ph3d_k,gs_hamk%kpt_k,gs_hamk%kg_k,gs_hamk%kpg_k, &
|
||||
compute_grad_atom=(optforces>0))
|
||||
end if
|
||||
else
|
||||
ABI_ERROR("istwfk > 2 is not handled with OpenMP GPU offload mode !")
|
||||
end if
|
||||
call make_gemm_nonlop_ompgpu(my_ikpt,gs_hamk%npw_fft_k,gs_hamk%lmnmax, &
|
||||
gs_hamk%ntypat, gs_hamk%indlmn, gs_hamk%nattyp, gs_hamk%istwf_k, &
|
||||
gs_hamk%ucvol, gs_hamk%ffnl_k, &
|
||||
gs_hamk%ph3d_k,gs_hamk%kpt_k,gs_hamk%kg_k,gs_hamk%kpg_k, &
|
||||
compute_grad_atom=(optforces>0))
|
||||
end if
|
||||
end if
|
||||
end if
|
||||
|
||||
|
|
|
@ -596,7 +596,7 @@ subroutine outscfcv(atindx1,cg,compch_fft,compch_sph,cprj,dimcprj,dmatpawu,dtfil
|
|||
|
||||
! Output of the GSR file (except when we are inside mover)
|
||||
#ifdef HAVE_NETCDF
|
||||
#if 0
|
||||
#ifndef FC_CRAY
|
||||
if (me == master .and. dtset%prtgsr == 1 .and. dtset%usewvl == 0) then
|
||||
!.and. (dtset%ionmov /= 0 .or. dtset%optcell /= 0)) then
|
||||
fname = strcat(dtfil%filnam_ds(4), "_GSR.nc")
|
||||
|
|
|
@ -456,9 +456,6 @@ subroutine gstate(args_gs,acell,codvsn,cpui,dtfil,dtset,iexit,initialized,&
|
|||
else if(dtset%gpu_option == ABI_GPU_DISABLED) then
|
||||
call init_gemm_nonlop(dtset%nkpt)
|
||||
end if
|
||||
else if(dtset%gpu_option == ABI_GPU_DISABLED) then
|
||||
call init_gemm_nonlop(dtset%nkpt)
|
||||
end if
|
||||
end if
|
||||
|
||||
gemm_nonlop_is_distributed = .false.
|
||||
|
@ -1812,8 +1809,6 @@ subroutine gstate(args_gs,acell,codvsn,cpui,dtfil,dtset,iexit,initialized,&
|
|||
call destroy_gemm_nonlop(dtset%nkpt)
|
||||
else if(dtset%gpu_option==ABI_GPU_DISABLED) then
|
||||
call destroy_gemm_nonlop(dtset%nkpt)
|
||||
else if(dtset%gpu_option==ABI_GPU_DISABLED) then
|
||||
call destroy_gemm_nonlop(dtset%nkpt)
|
||||
end if
|
||||
gemm_nonlop_use_gemm = .false.
|
||||
end if
|
||||
|
|
|
@ -68,8 +68,9 @@
|
|||
|
||||
#include "abi_common.h"
|
||||
|
||||
! nvtx related macro definition
|
||||
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||
#include "nvtx_macros.h"
|
||||
#endif
|
||||
|
||||
program abinit
|
||||
|
||||
|
@ -385,7 +386,7 @@ program abinit
|
|||
end if
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_GPU_MARKERS
|
||||
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||
NVTX_INIT(use_nvtx)
|
||||
#endif
|
||||
|
||||
|
|
Loading…
Reference in New Issue