mirror of https://github.com/abinit/abinit.git
Solve merge conflicts
This commit is contained in:
parent
420ed8ab0f
commit
bd2354497d
|
@ -366,6 +366,9 @@
|
||||||
/* Define to 1 if you want to activate support for OpenMP GPU offload. */
|
/* Define to 1 if you want to activate support for OpenMP GPU offload. */
|
||||||
#cmakedefine HAVE_OPENMP_OFFLOAD @HAVE_OPENMP_OFFLOAD@
|
#cmakedefine HAVE_OPENMP_OFFLOAD @HAVE_OPENMP_OFFLOAD@
|
||||||
|
|
||||||
|
/* Define to 1 if you want to activate support for OpenMP GPU offload. */
|
||||||
|
#cmakedefine HAVE_OPENMP_OFFLOAD_DATASTRUCTURE @HAVE_OPENMP_OFFLOAD_DATASTRUCTURE@
|
||||||
|
|
||||||
/* Set to 1 if OpenMP has a working implementation of COLLAPSE. */
|
/* Set to 1 if OpenMP has a working implementation of COLLAPSE. */
|
||||||
#cmakedefine HAVE_OMP_COLLAPSE @HAVE_OMP_COLLAPSE@
|
#cmakedefine HAVE_OMP_COLLAPSE @HAVE_OMP_COLLAPSE@
|
||||||
|
|
||||||
|
|
|
@ -485,6 +485,8 @@ inline const char* _ConvertSMVer2ArchName(int major, int minor) {
|
||||||
{0x75, "Turing"},
|
{0x75, "Turing"},
|
||||||
{0x80, "Ampere"},
|
{0x80, "Ampere"},
|
||||||
{0x86, "Ampere"},
|
{0x86, "Ampere"},
|
||||||
|
{0x89, "AdaLovelace"},
|
||||||
|
{0x90, "Hopper"},
|
||||||
{-1, "Graphics Device"}};
|
{-1, "Graphics Device"}};
|
||||||
|
|
||||||
int index = 0;
|
int index = 0;
|
||||||
|
|
|
@ -499,6 +499,8 @@ inline const char* _ConvertSMVer2ArchName(int major, int minor) {
|
||||||
{0x75, "Turing"},
|
{0x75, "Turing"},
|
||||||
{0x80, "Ampere"},
|
{0x80, "Ampere"},
|
||||||
{0x86, "Ampere"},
|
{0x86, "Ampere"},
|
||||||
|
{0x89, "AdaLovelace"},
|
||||||
|
{0x90, "Hopper"},
|
||||||
{-1, "Graphics Device"}};
|
{-1, "Graphics Device"}};
|
||||||
|
|
||||||
int index = 0;
|
int index = 0;
|
||||||
|
|
|
@ -418,39 +418,24 @@ end subroutine Get_Mem_Dev
|
||||||
#if defined HAVE_GPU
|
#if defined HAVE_GPU
|
||||||
|
|
||||||
! Closing YAKL and Kokkos if opened
|
! Closing YAKL and Kokkos if opened
|
||||||
if (gpu_option==ABI_GPU_KOKKOS .or. gpu_option==ABI_GPU_LEGACY) then
|
if (gpu_option==ABI_GPU_KOKKOS) then
|
||||||
#ifdef HAVE_YAKL
|
#ifdef HAVE_YAKL
|
||||||
call gator_finalize()
|
call gator_finalize()
|
||||||
write(std_out,*)'yakl gator finalized'
|
write(std_out,*)'yakl gator finalized'
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef HAVE_KOKKOS
|
#ifdef HAVE_KOKKOS
|
||||||
! finalize kokkos
|
! finalize kokkos
|
||||||
call kokkos_finalize()
|
call kokkos_finalize()
|
||||||
write(std_out,*)'kokkos finalized'
|
write(std_out,*)'kokkos finalized'
|
||||||
#endif
|
#endif
|
||||||
|
!kokkos_finalize already reset GPU context
|
||||||
|
!if (gpu_option/=ABI_GPU_KOKKOS) call unset_dev()
|
||||||
end if
|
end if
|
||||||
|
|
||||||
! kokkos_finalize already reset GPU context
|
if (gpu_option==ABI_GPU_LEGACY) then
|
||||||
!if (gpu_option/=ABI_GPU_KOKKOS) call unset_dev()
|
call unset_dev()
|
||||||
|
|
||||||
! Closing YAKL and Kokkos if opened
|
|
||||||
if (gpu_option==ABI_GPU_KOKKOS .or. gpu_option==ABI_GPU_LEGACY) then
|
|
||||||
#ifdef HAVE_YAKL
|
|
||||||
call gator_finalize()
|
|
||||||
write(std_out,*)'yakl gator finalized'
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef HAVE_KOKKOS
|
|
||||||
! finalize kokkos
|
|
||||||
call kokkos_finalize()
|
|
||||||
write(std_out,*)'kokkos finalized'
|
|
||||||
#endif
|
|
||||||
end if
|
end if
|
||||||
|
|
||||||
! kokkos_finalize already reset GPU context
|
|
||||||
!if (gpu_option/=ABI_GPU_KOKKOS) call unset_dev()
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
end subroutine unsetdevice_cuda
|
end subroutine unsetdevice_cuda
|
||||||
!!***
|
!!***
|
||||||
|
|
|
@ -583,7 +583,7 @@ subroutine nctk_test_mpiio(print_warning)
|
||||||
|
|
||||||
!FIXME nf90create fails when using NVHPC
|
!FIXME nf90create fails when using NVHPC
|
||||||
! This might be due to my environment, maybe not, need to investigate this...
|
! This might be due to my environment, maybe not, need to investigate this...
|
||||||
#ifndef FC_NVHPC
|
!!#ifndef FC_NVHPC
|
||||||
#ifdef HAVE_NETCDF_MPI
|
#ifdef HAVE_NETCDF_MPI
|
||||||
if (xmpi_comm_rank(xmpi_world) == master) then
|
if (xmpi_comm_rank(xmpi_world) == master) then
|
||||||
! Try to open a file with hdf5.
|
! Try to open a file with hdf5.
|
||||||
|
@ -619,7 +619,7 @@ subroutine nctk_test_mpiio(print_warning)
|
||||||
ABI_WARNING(msg)
|
ABI_WARNING(msg)
|
||||||
end if
|
end if
|
||||||
#endif
|
#endif
|
||||||
#endif
|
!!#endif
|
||||||
|
|
||||||
#ifdef HAVE_NETCDF_DEFAULT
|
#ifdef HAVE_NETCDF_DEFAULT
|
||||||
if (.not. nctk_has_mpiio) then
|
if (.not. nctk_has_mpiio) then
|
||||||
|
|
|
@ -5002,7 +5002,7 @@ end subroutine abi_gpu_xcopy_2z
|
||||||
!! b
|
!! b
|
||||||
!!
|
!!
|
||||||
!! SIDE EFFECTS
|
!! SIDE EFFECTS
|
||||||
!! WARNING! : this routine is a dummy one when HAVE_GPU_CUDA is not enabled
|
!! WARNING! : this routine is a dummy one when HAVE_GPU is not enabled
|
||||||
!! the correct one is in 17_toolbox/gpu_linalg.cu
|
!! the correct one is in 17_toolbox/gpu_linalg.cu
|
||||||
!!
|
!!
|
||||||
!! SOURCE
|
!! SOURCE
|
||||||
|
|
|
@ -204,11 +204,6 @@ end subroutine elpa_func_uninit
|
||||||
!! INPUTS
|
!! INPUTS
|
||||||
!! [blacs_ctx]= -- optional -- Blacs context
|
!! [blacs_ctx]= -- optional -- Blacs context
|
||||||
!! [gpu]= -- optional -- Flag (0 or 1): use GPU version (currently only NVidia)
|
!! [gpu]= -- optional -- Flag (0 or 1): use GPU version (currently only NVidia)
|
||||||
!! na=Order of matrix A
|
|
||||||
!! nblk=Blocksize of cyclic distribution, must be the same in both directions!
|
|
||||||
!! local_nrows=Leading dimension of A
|
|
||||||
!! local_ncols=Local columns of matrixes A and Q (eigenvectors)
|
|
||||||
!! nev=Number of eigenvalues needed.
|
|
||||||
!!
|
!!
|
||||||
!! SIDE EFFECTS
|
!! SIDE EFFECTS
|
||||||
!! elpa_hdl(type<elpa_hdl_t>)= ELPA handle
|
!! elpa_hdl(type<elpa_hdl_t>)= ELPA handle
|
||||||
|
@ -223,6 +218,7 @@ subroutine elpa_func_allocate(elpa_hdl,gpu,blacs_ctx)
|
||||||
|
|
||||||
!Local variables-------------------------------
|
!Local variables-------------------------------
|
||||||
integer :: err,l_gpu,l_blacs_ctx
|
integer :: err,l_gpu,l_blacs_ctx
|
||||||
|
logical :: gpu_debug_mode=.false.
|
||||||
character(len=10) :: varname
|
character(len=10) :: varname
|
||||||
|
|
||||||
! *********************************************************************
|
! *********************************************************************
|
||||||
|
@ -261,49 +257,30 @@ subroutine elpa_func_allocate(elpa_hdl,gpu,blacs_ctx)
|
||||||
ABI_ERROR("You seem to use an old version of ELPA ( < 2021.x ) which only supports NVIDIA GPUs.")
|
ABI_ERROR("You seem to use an old version of ELPA ( < 2021.x ) which only supports NVIDIA GPUs.")
|
||||||
#endif
|
#endif
|
||||||
end if
|
end if
|
||||||
|
|
||||||
call elpa_func_error_handler(err_code=err,err_msg='Error when enabling GPU on ELPA')
|
|
||||||
if (err==ELPA_OK) call elpa_hdl%elpa%set("debug",1,err)
|
|
||||||
call elpa_func_error_handler(err_code=err,err_msg='Error when enabling debug on ELPA')
|
|
||||||
end if
|
|
||||||
#else
|
|
||||||
if (err==0.and.l_gpu==1) elpa_hdl%gpu=l_gpu
|
|
||||||
#endif
|
|
||||||
|
|
||||||
call elpa_func_error_handler(err_code=err,err_varname=varname)
|
|
||||||
|
|
||||||
call elpa_func_error_handler(err_code=err,err_msg='Error when enabling GPU on ELPA')
|
call elpa_func_error_handler(err_code=err,err_msg='Error when enabling GPU on ELPA')
|
||||||
|
|
||||||
!if (err==ELPA_OK) call elpa_hdl%elpa%set("debug",1,err)
|
if (gpu_debug_mode) then
|
||||||
!call elpa_func_error_handler(err_code=err,err_msg='Error when enabling debug on ELPA')
|
if (err==ELPA_OK) call elpa_hdl%elpa%set("debug",1,err)
|
||||||
|
call elpa_func_error_handler(err_code=err,err_msg='Error when enabling debug on ELPA')
|
||||||
|
end if
|
||||||
|
|
||||||
end if
|
end if
|
||||||
#else
|
#else
|
||||||
if (err==0.and.l_gpu==1) elpa_hdl%gpu=l_gpu
|
if (err==0.and.l_gpu==1) then
|
||||||
|
elpa_hdl%gpu=l_gpu
|
||||||
|
if (gpu_debug_mode) elpa_hdl%debug=1
|
||||||
|
end if
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
call elpa_func_error_handler(err_code=err,err_varname=varname)
|
|
||||||
|
|
||||||
if (present(blacs_ctx)) then
|
if (present(blacs_ctx)) then
|
||||||
if (err==ELPA_OK) call elpa_hdl%elpa%set("blacs_context",int(blacs_ctx,kind=c_int),err)
|
if (err==ELPA_OK) call elpa_hdl%elpa%set("blacs_context",int(blacs_ctx,kind=c_int),err)
|
||||||
|
call elpa_func_error_handler(err_code=err,err_varname=varname)
|
||||||
end if
|
end if
|
||||||
|
|
||||||
elpa_hdl%is_allocated=.true.
|
|
||||||
|
|
||||||
! Setting matrix size
|
|
||||||
call elpa_func_set_matrix(elpa_hdl,na,nblk,local_nrows,local_ncols,nev)
|
|
||||||
|
|
||||||
if (present(blacs_ctx)) then
|
|
||||||
if (err==ELPA_OK) call elpa_hdl%elpa%set("blacs_context",int(blacs_ctx,kind=c_int),err)
|
|
||||||
end if
|
|
||||||
|
|
||||||
! Proper ELPA setup
|
|
||||||
err = elpa_hdl%elpa%setup()
|
|
||||||
call elpa_func_error_handler(err_code=err,err_msg='Error during ELPA setup')
|
|
||||||
|
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||||
call nvtxEndRange()
|
call nvtxEndRange()
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
end subroutine elpa_func_allocate
|
end subroutine elpa_func_allocate
|
||||||
!!***
|
!!***
|
||||||
|
|
||||||
|
@ -453,6 +430,12 @@ subroutine elpa_func_get_communicators(elpa_hdl,mpi_comm_parent,process_row,proc
|
||||||
varname='process_col'
|
varname='process_col'
|
||||||
call elpa_hdl%elpa%set(trim(varname),process_col,err)
|
call elpa_hdl%elpa%set(trim(varname),process_col,err)
|
||||||
end if
|
end if
|
||||||
|
if (err==ELPA_OK) then
|
||||||
|
varname=''
|
||||||
|
err = elpa_hdl%elpa%setup()
|
||||||
|
call elpa_func_error_handler(err_code=err,err_msg='Error during ELPA setup')
|
||||||
|
endif
|
||||||
|
|
||||||
#else
|
#else
|
||||||
elpa_hdl%mpi_comm_parent=mpi_comm_parent
|
elpa_hdl%mpi_comm_parent=mpi_comm_parent
|
||||||
elpa_hdl%process_row=process_row
|
elpa_hdl%process_row=process_row
|
||||||
|
@ -467,10 +450,13 @@ subroutine elpa_func_get_communicators(elpa_hdl,mpi_comm_parent,process_row,proc
|
||||||
!ELPA-LEGACY-2017
|
!ELPA-LEGACY-2017
|
||||||
err=elpa_get_communicators(mpi_comm_parent,process_row,process_col,elpa_hdl%elpa_comm_rows,elpa_hdl%elpa_comm_cols)
|
err=elpa_get_communicators(mpi_comm_parent,process_row,process_col,elpa_hdl%elpa_comm_rows,elpa_hdl%elpa_comm_cols)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
call elpa_func_error_handler(err_code=err,err_msg='Error in elpa_get_communicators',err_varname=varname)
|
call elpa_func_error_handler(err_code=err,err_msg='Error in elpa_get_communicators',err_varname=varname)
|
||||||
|
|
||||||
|
elpa_hdl%is_allocated=.true.
|
||||||
|
|
||||||
end subroutine elpa_func_get_communicators
|
end subroutine elpa_func_get_communicators
|
||||||
!!***
|
!!***
|
||||||
|
|
||||||
|
|
|
@ -2687,7 +2687,7 @@ subroutine compute_eigen_problem(processor, matrix, results, eigen, comm, istwf_
|
||||||
|
|
||||||
call elpa_func_allocate(elpa_hdl,gpu=use_gpu_elpa_)
|
call elpa_func_allocate(elpa_hdl,gpu=use_gpu_elpa_)
|
||||||
call elpa_func_set_matrix(elpa_hdl,matrix%sizeb_global(1),matrix%sizeb_blocs(1),nev__,&
|
call elpa_func_set_matrix(elpa_hdl,matrix%sizeb_global(1),matrix%sizeb_blocs(1),nev__,&
|
||||||
& matrix%sizeb_local(1),matrix%sizeb_local(2),nev__,gpu=use_gpu)
|
& matrix%sizeb_local(1),matrix%sizeb_local(2))
|
||||||
call elpa_func_get_communicators(elpa_hdl,processor%comm,processor%coords(1),processor%coords(2))
|
call elpa_func_get_communicators(elpa_hdl,processor%comm,processor%coords(1),processor%coords(2))
|
||||||
|
|
||||||
if (istwf_k/=2) then
|
if (istwf_k/=2) then
|
||||||
|
@ -2912,6 +2912,10 @@ subroutine solve_gevp_complex(na,nev,na_rows,na_cols,nblk,a,b,ev,z,tmp1,tmp2, &
|
||||||
if (present(use_gpu_elpa)) use_gpu_elpa_=use_gpu_elpa
|
if (present(use_gpu_elpa)) use_gpu_elpa_=use_gpu_elpa
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||||
|
call nvtxStartRange("solve_gevp_complex",12)
|
||||||
|
#endif
|
||||||
|
|
||||||
! Allocate ELPA handle
|
! Allocate ELPA handle
|
||||||
call elpa_func_allocate(elpa_hdl,blacs_ctx=sc_desc(CTXT_),gpu=use_gpu_elpa_)
|
call elpa_func_allocate(elpa_hdl,blacs_ctx=sc_desc(CTXT_),gpu=use_gpu_elpa_)
|
||||||
call elpa_func_set_matrix(elpa_hdl,na,nblk,nev,na_rows,na_cols)
|
call elpa_func_set_matrix(elpa_hdl,na,nblk,nev,na_rows,na_cols)
|
||||||
|
@ -2920,6 +2924,7 @@ subroutine solve_gevp_complex(na,nev,na_rows,na_cols,nblk,a,b,ev,z,tmp1,tmp2, &
|
||||||
call elpa_func_solve_gevp_2stage(elpa_hdl,a,b,z,ev,nev)
|
call elpa_func_solve_gevp_2stage(elpa_hdl,a,b,z,ev,nev)
|
||||||
|
|
||||||
call elpa_func_deallocate(elpa_hdl)
|
call elpa_func_deallocate(elpa_hdl)
|
||||||
|
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||||
call nvtxEndRange()
|
call nvtxEndRange()
|
||||||
#endif
|
#endif
|
||||||
|
@ -2957,6 +2962,10 @@ subroutine solve_gevp_real(na,nev,na_rows,na_cols,nblk,a,b,ev,z,tmp1,tmp2, &
|
||||||
if (present(use_gpu_elpa)) use_gpu_elpa_=use_gpu_elpa
|
if (present(use_gpu_elpa)) use_gpu_elpa_=use_gpu_elpa
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||||
|
call nvtxStartRange("solve_gevp_real",12)
|
||||||
|
#endif
|
||||||
|
|
||||||
! Allocate ELPA handle
|
! Allocate ELPA handle
|
||||||
call elpa_func_allocate(elpa_hdl,blacs_ctx=sc_desc(CTXT_),gpu=use_gpu_elpa_)
|
call elpa_func_allocate(elpa_hdl,blacs_ctx=sc_desc(CTXT_),gpu=use_gpu_elpa_)
|
||||||
call elpa_func_set_matrix(elpa_hdl,na,nblk,nev,na_rows,na_cols)
|
call elpa_func_set_matrix(elpa_hdl,na,nblk,nev,na_rows,na_cols)
|
||||||
|
@ -2999,6 +3008,10 @@ subroutine solve_gevp_real(na,nev,na_rows,na_cols,nblk,a,b,ev,z,tmp1,tmp2, &
|
||||||
|
|
||||||
call elpa_func_deallocate(elpa_hdl)
|
call elpa_func_deallocate(elpa_hdl)
|
||||||
|
|
||||||
|
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||||
|
call nvtxEndRange()
|
||||||
|
#endif
|
||||||
|
|
||||||
end subroutine solve_gevp_real
|
end subroutine solve_gevp_real
|
||||||
!!***
|
!!***
|
||||||
#endif
|
#endif
|
||||||
|
@ -3050,6 +3063,7 @@ subroutine compute_generalized_eigen_problem(processor,matrix1,matrix2,results,e
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||||
call nvtxStartRange("slk_compute_generalized_eigen", 10)
|
call nvtxStartRange("slk_compute_generalized_eigen", 10)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
nev__ = matrix1%sizeb_global(2); if (present(nev)) nev__ = nev
|
nev__ = matrix1%sizeb_global(2); if (present(nev)) nev__ = nev
|
||||||
use_gpu_elpa__ = 0
|
use_gpu_elpa__ = 0
|
||||||
#ifdef HAVE_LINALG_ELPA
|
#ifdef HAVE_LINALG_ELPA
|
||||||
|
@ -3082,6 +3096,7 @@ subroutine compute_generalized_eigen_problem(processor,matrix1,matrix2,results,e
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||||
call nvtxEndRange()
|
call nvtxEndRange()
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#else
|
#else
|
||||||
!Arguments ------------------------------------
|
!Arguments ------------------------------------
|
||||||
class(processor_scalapack),intent(in) :: processor
|
class(processor_scalapack),intent(in) :: processor
|
||||||
|
@ -3292,6 +3307,10 @@ subroutine compute_eigen1(comm,processor,cplex,nbli_global,nbco_global,matrix,ve
|
||||||
if (present(use_gpu_elpa)) use_gpu_elpa_=use_gpu_elpa
|
if (present(use_gpu_elpa)) use_gpu_elpa_=use_gpu_elpa
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||||
|
call nvtxStartRange("slk_compute_eigen1", 7)
|
||||||
|
#endif
|
||||||
|
|
||||||
! ================================
|
! ================================
|
||||||
! INITIALISATION SCALAPACK MATRIX
|
! INITIALISATION SCALAPACK MATRIX
|
||||||
! ================================
|
! ================================
|
||||||
|
@ -3358,6 +3377,10 @@ subroutine compute_eigen1(comm,processor,cplex,nbli_global,nbco_global,matrix,ve
|
||||||
ABI_SFREE(z_tmp_evec)
|
ABI_SFREE(z_tmp_evec)
|
||||||
ABI_SFREE(r_tmp_evec)
|
ABI_SFREE(r_tmp_evec)
|
||||||
|
|
||||||
|
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||||
|
call nvtxEndRange()
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef HAVE_LINALG_ELPA
|
#ifndef HAVE_LINALG_ELPA
|
||||||
ABI_UNUSED(use_gpu_elpa)
|
ABI_UNUSED(use_gpu_elpa)
|
||||||
#endif
|
#endif
|
||||||
|
@ -3423,6 +3446,10 @@ subroutine compute_eigen2(comm,processor,cplex,nbli_global,nbco_global,matrix1,m
|
||||||
if (present(use_gpu_elpa)) use_gpu_elpa_=use_gpu_elpa
|
if (present(use_gpu_elpa)) use_gpu_elpa_=use_gpu_elpa
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||||
|
call nvtxStartRange("slk_compute_eigen2", 7)
|
||||||
|
#endif
|
||||||
|
|
||||||
! ================================
|
! ================================
|
||||||
! INITIALISATION SCALAPACK MATRIX
|
! INITIALISATION SCALAPACK MATRIX
|
||||||
! ================================
|
! ================================
|
||||||
|
@ -3495,13 +3522,14 @@ subroutine compute_eigen2(comm,processor,cplex,nbli_global,nbco_global,matrix1,m
|
||||||
call sca_matrix2%free()
|
call sca_matrix2%free()
|
||||||
call sca_matrix3%free()
|
call sca_matrix3%free()
|
||||||
|
|
||||||
|
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
||||||
|
call nvtxEndRange()
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef HAVE_LINALG_ELPA
|
#ifndef HAVE_LINALG_ELPA
|
||||||
ABI_UNUSED(use_gpu_elpa)
|
ABI_UNUSED(use_gpu_elpa)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
|
||||||
call nvtxEndRange()
|
|
||||||
#endif
|
|
||||||
end subroutine compute_eigen2
|
end subroutine compute_eigen2
|
||||||
!!***
|
!!***
|
||||||
|
|
||||||
|
|
|
@ -29,10 +29,14 @@ cpp_options:
|
||||||
- DEV_YP_DEBUG_PSP
|
- DEV_YP_DEBUG_PSP
|
||||||
- DEV_YP_VDWXC
|
- DEV_YP_VDWXC
|
||||||
- FC_ABSOFT
|
- FC_ABSOFT
|
||||||
|
- FC_CRAY
|
||||||
|
- FC_FLANG
|
||||||
- FC_GNU
|
- FC_GNU
|
||||||
- FC_IBM
|
- FC_IBM
|
||||||
- FC_INTEL
|
- FC_INTEL
|
||||||
|
- FC_LLVM
|
||||||
- FC_NAG
|
- FC_NAG
|
||||||
|
- FC_NVHPC
|
||||||
- FC_PGI
|
- FC_PGI
|
||||||
- FFT_PRECISION
|
- FFT_PRECISION
|
||||||
- GPU_FOUR_HEADER_H
|
- GPU_FOUR_HEADER_H
|
||||||
|
@ -88,6 +92,7 @@ cpp_options:
|
||||||
- HAVE_GPU_CUDA_DP
|
- HAVE_GPU_CUDA_DP
|
||||||
- HAVE_GPU_CUDA_SP
|
- HAVE_GPU_CUDA_SP
|
||||||
- HAVE_GPU_CUDA_TM
|
- HAVE_GPU_CUDA_TM
|
||||||
|
- HAVE_GPU_HIP
|
||||||
- HAVE_GPU_MPI
|
- HAVE_GPU_MPI
|
||||||
- HAVE_GPU_SERIAL
|
- HAVE_GPU_SERIAL
|
||||||
- HAVE_GW_DPC
|
- HAVE_GW_DPC
|
||||||
|
@ -154,6 +159,8 @@ cpp_options:
|
||||||
- HAVE_NUMPY
|
- HAVE_NUMPY
|
||||||
- HAVE_OMP_COLLAPSE
|
- HAVE_OMP_COLLAPSE
|
||||||
- HAVE_OPENMP
|
- HAVE_OPENMP
|
||||||
|
- HAVE_OPENMP_OFFLOAD
|
||||||
|
- HAVE_OPENMP_OFFLOAD_DATASTRUCTURE
|
||||||
- HAVE_OS_LINUX
|
- HAVE_OS_LINUX
|
||||||
- HAVE_OS_MACOSX
|
- HAVE_OS_MACOSX
|
||||||
- HAVE_OS_WINDOWS
|
- HAVE_OS_WINDOWS
|
||||||
|
|
|
@ -124,7 +124,6 @@ type, public :: dataset_type
|
||||||
integer :: diismemory
|
integer :: diismemory
|
||||||
integer :: dipdip = 1
|
integer :: dipdip = 1
|
||||||
integer :: dipquad = 1
|
integer :: dipquad = 1
|
||||||
integer :: distribute_gemm_nonlop = 0
|
|
||||||
integer :: dmatpuopt
|
integer :: dmatpuopt
|
||||||
integer :: dmatudiag
|
integer :: dmatudiag
|
||||||
integer :: dmft_dc
|
integer :: dmft_dc
|
||||||
|
@ -202,7 +201,6 @@ type, public :: dataset_type
|
||||||
integer :: ga_algor
|
integer :: ga_algor
|
||||||
integer :: ga_fitness
|
integer :: ga_fitness
|
||||||
integer :: ga_n_rules
|
integer :: ga_n_rules
|
||||||
integer :: gemm_nonlop_split_size = 1
|
|
||||||
integer :: getcell = 0
|
integer :: getcell = 0
|
||||||
integer :: getddb = 0
|
integer :: getddb = 0
|
||||||
integer :: getdvdb = 0
|
integer :: getdvdb = 0
|
||||||
|
@ -623,7 +621,6 @@ type, public :: dataset_type
|
||||||
integer :: tl_nprccg
|
integer :: tl_nprccg
|
||||||
!U
|
!U
|
||||||
integer :: ucrpa
|
integer :: ucrpa
|
||||||
integer :: use_gpu_openmp_threads
|
|
||||||
integer :: usedmatpu
|
integer :: usedmatpu
|
||||||
integer :: usedmft
|
integer :: usedmft
|
||||||
integer :: useexexch
|
integer :: useexexch
|
||||||
|
@ -1445,7 +1442,6 @@ type(dataset_type) function dtset_copy(dtin) result(dtout)
|
||||||
dtout%delayperm = dtin%delayperm
|
dtout%delayperm = dtin%delayperm
|
||||||
dtout%diismemory = dtin%diismemory
|
dtout%diismemory = dtin%diismemory
|
||||||
dtout%dipquad = dtin%dipquad
|
dtout%dipquad = dtin%dipquad
|
||||||
dtout%distribute_gemm_nonlop = dtin%distribute_gemm_nonlop
|
|
||||||
dtout%dmatpuopt = dtin%dmatpuopt
|
dtout%dmatpuopt = dtin%dmatpuopt
|
||||||
dtout%dmatudiag = dtin%dmatudiag
|
dtout%dmatudiag = dtin%dmatudiag
|
||||||
dtout%dmft_dc = dtin%dmft_dc
|
dtout%dmft_dc = dtin%dmft_dc
|
||||||
|
@ -1578,7 +1574,6 @@ type(dataset_type) function dtset_copy(dtin) result(dtout)
|
||||||
dtout%ga_algor = dtin%ga_algor
|
dtout%ga_algor = dtin%ga_algor
|
||||||
dtout%ga_fitness = dtin%ga_fitness
|
dtout%ga_fitness = dtin%ga_fitness
|
||||||
dtout%ga_n_rules = dtin%ga_n_rules
|
dtout%ga_n_rules = dtin%ga_n_rules
|
||||||
dtout%gemm_nonlop_split_size = dtin%gemm_nonlop_split_size
|
|
||||||
dtout%getbseig = dtin%getbseig
|
dtout%getbseig = dtin%getbseig
|
||||||
dtout%getbsreso = dtin%getbsreso
|
dtout%getbsreso = dtin%getbsreso
|
||||||
dtout%getbscoup = dtin%getbscoup
|
dtout%getbscoup = dtin%getbscoup
|
||||||
|
@ -1994,7 +1989,6 @@ type(dataset_type) function dtset_copy(dtin) result(dtout)
|
||||||
dtout%tim1rev = dtin%tim1rev
|
dtout%tim1rev = dtin%tim1rev
|
||||||
dtout%timopt = dtin%timopt
|
dtout%timopt = dtin%timopt
|
||||||
dtout%use_gemm_nonlop = dtin%use_gemm_nonlop
|
dtout%use_gemm_nonlop = dtin%use_gemm_nonlop
|
||||||
dtout%use_gpu_openmp_threads = dtin%use_gpu_openmp_threads
|
|
||||||
dtout%useextfpmd = dtin%useextfpmd
|
dtout%useextfpmd = dtin%useextfpmd
|
||||||
dtout%use_yaml = dtin%use_yaml ! This variable activates the Yaml output for testing purposes
|
dtout%use_yaml = dtin%use_yaml ! This variable activates the Yaml output for testing purposes
|
||||||
! It will be removed when Yaml output enters production.
|
! It will be removed when Yaml output enters production.
|
||||||
|
@ -3317,7 +3311,7 @@ subroutine chkvars(string)
|
||||||
list_vars=trim(list_vars)//' delayperm densfor_pred densty dfield'
|
list_vars=trim(list_vars)//' delayperm densfor_pred densty dfield'
|
||||||
list_vars=trim(list_vars)//' dfpt_sciss diecut diegap dielam dielng diemac'
|
list_vars=trim(list_vars)//' dfpt_sciss diecut diegap dielam dielng diemac'
|
||||||
list_vars=trim(list_vars)//' diemix diemixmag diismemory'
|
list_vars=trim(list_vars)//' diemix diemixmag diismemory'
|
||||||
list_vars=trim(list_vars)//' dilatmx dipdip dipquad dipdip_prt dipdip_range distribute_gemm_nonlop'
|
list_vars=trim(list_vars)//' dilatmx dipdip dipquad dipdip_prt dipdip_range'
|
||||||
list_vars=trim(list_vars)//' dmatpawu dmatpuopt dmatudiag'
|
list_vars=trim(list_vars)//' dmatpawu dmatpuopt dmatudiag'
|
||||||
list_vars=trim(list_vars)//' dmftbandi dmftbandf dmftctqmc_basis'
|
list_vars=trim(list_vars)//' dmftbandi dmftbandf dmftctqmc_basis'
|
||||||
list_vars=trim(list_vars)//' dmftctqmc_check dmftctqmc_correl dmftctqmc_gmove'
|
list_vars=trim(list_vars)//' dmftctqmc_check dmftctqmc_correl dmftctqmc_gmove'
|
||||||
|
@ -3365,7 +3359,7 @@ subroutine chkvars(string)
|
||||||
list_vars=trim(list_vars)//' f4of2_sla f6of2_sla'
|
list_vars=trim(list_vars)//' f4of2_sla f6of2_sla'
|
||||||
!G
|
!G
|
||||||
list_vars=trim(list_vars)//' ga_algor ga_fitness ga_n_rules ga_opt_percent ga_rules'
|
list_vars=trim(list_vars)//' ga_algor ga_fitness ga_n_rules ga_opt_percent ga_rules'
|
||||||
list_vars=trim(list_vars)//' gemm_nonlop_split_size genafm getbscoup getbseig getbsreso getcell'
|
list_vars=trim(list_vars)//' genafm getbscoup getbseig getbsreso getcell'
|
||||||
list_vars=trim(list_vars)//' getddb getddb_filepath getden_filepath getddk'
|
list_vars=trim(list_vars)//' getddb getddb_filepath getden_filepath getddk'
|
||||||
list_vars=trim(list_vars)//' getdelfd getdkdk getdkde getden getkden getdvdb getdvdb_filepath'
|
list_vars=trim(list_vars)//' getdelfd getdkdk getdkde getden getkden getdvdb getdvdb_filepath'
|
||||||
list_vars=trim(list_vars)//' getefmas getkerange_filepath getgam_eig2nkq'
|
list_vars=trim(list_vars)//' getefmas getkerange_filepath getgam_eig2nkq'
|
||||||
|
@ -3539,7 +3533,6 @@ subroutine chkvars(string)
|
||||||
list_vars=trim(list_vars)//' userra userrb userrc userrd userre'
|
list_vars=trim(list_vars)//' userra userrb userrc userrd userre'
|
||||||
list_vars=trim(list_vars)//' usewvl usexcnhat useylm use_gemm_nonlop'
|
list_vars=trim(list_vars)//' usewvl usexcnhat useylm use_gemm_nonlop'
|
||||||
list_vars=trim(list_vars)//' use_slk useextfpmd use_yaml'
|
list_vars=trim(list_vars)//' use_slk useextfpmd use_yaml'
|
||||||
list_vars=trim(list_vars)//' use_slk useextfpmd use_yaml'
|
|
||||||
list_vars=trim(list_vars)//' use_oldchi'
|
list_vars=trim(list_vars)//' use_oldchi'
|
||||||
!V
|
!V
|
||||||
list_vars=trim(list_vars)//' vaclst vacnum vacuum vacwidth vcutgeo'
|
list_vars=trim(list_vars)//' vaclst vacnum vacuum vacwidth vcutgeo'
|
||||||
|
|
|
@ -605,7 +605,7 @@ module m_xgTransposer
|
||||||
call nvtxStartRange("MPI_AllToAllV", 8)
|
call nvtxStartRange("MPI_AllToAllV", 8)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if( xgTransposer%gou_option == ABI_GPU_KOKKOS) then
|
if( xgTransposer%gpu_option == ABI_GPU_KOKKOS) then
|
||||||
|
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_KOKKOS) && defined(HAVE_YAKL)
|
#if defined(HAVE_GPU_CUDA) && defined(HAVE_KOKKOS) && defined(HAVE_YAKL)
|
||||||
call timab(tim_all2allv,1,tsec)
|
call timab(tim_all2allv,1,tsec)
|
||||||
|
@ -621,7 +621,6 @@ module m_xgTransposer
|
||||||
recvbuf(:,:) = recvbuf_mpi(:,:)
|
recvbuf(:,:) = recvbuf_mpi(:,:)
|
||||||
|
|
||||||
ABI_FREE(recvbuf_mpi)
|
ABI_FREE(recvbuf_mpi)
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
else
|
else
|
||||||
|
|
|
@ -205,13 +205,12 @@ subroutine ompgpu_fourwf(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,ist
|
||||||
|
|
||||||
cfft_size = 2*n1*n2*n3*ndat
|
cfft_size = 2*n1*n2*n3*ndat
|
||||||
|
|
||||||
#ifdef HAVE_GPU_CUDA
|
#if defined HAVE_GPU_CUDA
|
||||||
byte_count=sizeof(work_gpu)
|
byte_count=sizeof(work_gpu)
|
||||||
!$OMP TARGET DATA USE_DEVICE_PTR(work_gpu)
|
!$OMP TARGET DATA USE_DEVICE_PTR(work_gpu)
|
||||||
call gpu_memset(c_loc(work_gpu), 0, byte_count)
|
call gpu_memset(c_loc(work_gpu), 0, byte_count)
|
||||||
!$OMP END TARGET DATA
|
!$OMP END TARGET DATA
|
||||||
#endif
|
#elif defined HAVE_GPU_HIP
|
||||||
#ifdef HAVE_GPU_HIP
|
|
||||||
!$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) PRIVATE(i1,i2,i3) MAP(to:work_gpu)
|
!$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) PRIVATE(i1,i2,i3) MAP(to:work_gpu)
|
||||||
do i3=1,n3*ndat
|
do i3=1,n3*ndat
|
||||||
do i2=1,n2
|
do i2=1,n2
|
||||||
|
@ -266,12 +265,11 @@ subroutine ompgpu_fourwf(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,ist
|
||||||
i1=kg_kin(1,ipw); if(i1<0)i1=i1+n1;
|
i1=kg_kin(1,ipw); if(i1<0)i1=i1+n1;
|
||||||
i2=kg_kin(2,ipw); if(i2<0)i2=i2+n2;
|
i2=kg_kin(2,ipw); if(i2<0)i2=i2+n2;
|
||||||
i3=kg_kin(3,ipw); if(i3<0)i3=i3+n3;
|
i3=kg_kin(3,ipw); if(i3<0)i3=i3+n3;
|
||||||
#ifdef HAVE_GPU_CUDA
|
#if defined HAVE_GPU_CUDA
|
||||||
i1inv = modulo(shift_inv1 - i1, n1) + 1
|
i1inv = modulo(shift_inv1 - i1, n1) + 1
|
||||||
i2inv = modulo(shift_inv2 - i2, n2) + 1
|
i2inv = modulo(shift_inv2 - i2, n2) + 1
|
||||||
i3inv = modulo(shift_inv3 - i3, n3) + 1
|
i3inv = modulo(shift_inv3 - i3, n3) + 1
|
||||||
#endif
|
#elif defined HAVE_GPU_HIP
|
||||||
#ifdef HAVE_GPU_HIP
|
|
||||||
i1inv = (shift_inv1-i1) - ( ((shift_inv1-i1)/n1) * n1 ) + 1
|
i1inv = (shift_inv1-i1) - ( ((shift_inv1-i1)/n1) * n1 ) + 1
|
||||||
i2inv = (shift_inv2-i2) - ( ((shift_inv2-i2)/n2) * n2 ) + 1
|
i2inv = (shift_inv2-i2) - ( ((shift_inv2-i2)/n2) * n2 ) + 1
|
||||||
i3inv = (shift_inv3-i3) - ( ((shift_inv3-i3)/n3) * n3 ) + 1
|
i3inv = (shift_inv3-i3) - ( ((shift_inv3-i3)/n3) * n3 ) + 1
|
||||||
|
|
|
@ -1513,7 +1513,7 @@ subroutine timana(mpi_enreg,natom,nband,ndtset,nfft,nkpt,npwtot,nsppol,timopt)
|
||||||
|
|
||||||
percent_limit=0.5_dp
|
percent_limit=0.5_dp
|
||||||
if (timopt<0) percent_limit=0.0001_dp
|
if (timopt<0) percent_limit=0.0001_dp
|
||||||
if (timopt<0) percent_limit=tol12
|
!if (timopt<0) percent_limit=tol12
|
||||||
|
|
||||||
!In case there is parallelism, report times for node 0
|
!In case there is parallelism, report times for node 0
|
||||||
!if (me==0 .and. nproc>1) then
|
!if (me==0 .and. nproc>1) then
|
||||||
|
@ -1591,10 +1591,10 @@ subroutine timana(mpi_enreg,natom,nband,ndtset,nfft,nkpt,npwtot,nsppol,timopt)
|
||||||
end if
|
end if
|
||||||
|
|
||||||
!Now, gather all information
|
!Now, gather all information
|
||||||
!call xmpi_sum(times,spaceworld,ierr)
|
call xmpi_sum(times,spaceworld,ierr)
|
||||||
!call xmpi_sum(ncount,spaceworld,ierr)
|
call xmpi_sum(ncount,spaceworld,ierr)
|
||||||
!call xmpi_sum(ftimes,spaceworld,ierr)
|
call xmpi_sum(ftimes,spaceworld,ierr)
|
||||||
!call xmpi_sum(nflops,spaceworld,ierr)
|
call xmpi_sum(nflops,spaceworld,ierr)
|
||||||
|
|
||||||
if (me==0) then ! Only the world master writes
|
if (me==0) then ! Only the world master writes
|
||||||
|
|
||||||
|
|
|
@ -2295,7 +2295,6 @@ subroutine indefo(dtsets, ndtset_alloc, nprocs)
|
||||||
dtsets(idtset)%dielam=half
|
dtsets(idtset)%dielam=half
|
||||||
dtsets(idtset)%diismemory=8
|
dtsets(idtset)%diismemory=8
|
||||||
dtsets(idtset)%dilatmx=one
|
dtsets(idtset)%dilatmx=one
|
||||||
dtsets(idtset)%distribute_gemm_nonlop=0
|
|
||||||
dtsets(idtset)%dmatpuopt=2
|
dtsets(idtset)%dmatpuopt=2
|
||||||
if (size(dtsets(idtset)%dmatpawu,4)>0) dtsets(idtset)%dmatpawu=-10._dp
|
if (size(dtsets(idtset)%dmatpawu,4)>0) dtsets(idtset)%dmatpawu=-10._dp
|
||||||
dtsets(idtset)%dmatudiag=0
|
dtsets(idtset)%dmatudiag=0
|
||||||
|
|
|
@ -257,10 +257,10 @@ SUBROUTINE BathOperatoroffdiag_init(op, flavors, samples, beta, iTech,opt_nondia
|
||||||
FREEIF(op%F)
|
FREEIF(op%F)
|
||||||
MALLOC(op%F,(1:op%sizeHybrid+1,1:flavors,1:flavors))
|
MALLOC(op%F,(1:op%sizeHybrid+1,1:flavors,1:flavors))
|
||||||
DT_FREEIF(op%tails)
|
DT_FREEIF(op%tails)
|
||||||
DT_MALLOC(op%tails, (1:op%flavors))
|
DT_MALLOC(op%tails,(1:op%flavors))
|
||||||
op%tails=0
|
op%tails=0
|
||||||
DT_FREEIF(op%Fshift)
|
DT_FREEIF(op%Fshift)
|
||||||
DT_MALLOC(op%Fshift, (1:op%flavors+1))
|
DT_MALLOC(op%Fshift,(1:op%flavors+1))
|
||||||
op%Fshift=0
|
op%Fshift=0
|
||||||
|
|
||||||
CALL Vector_init(op%R,100*op%flavors)
|
CALL Vector_init(op%R,100*op%flavors)
|
||||||
|
|
|
@ -643,7 +643,7 @@ SUBROUTINE Ctqmc_allocateAll(this)
|
||||||
this%measDE = 0.d0
|
this%measDE = 0.d0
|
||||||
|
|
||||||
FREEIF(this%mu)
|
FREEIF(this%mu)
|
||||||
MALLOC(this%mu, (1:flavors) )
|
MALLOC(this%mu,(1:flavors) )
|
||||||
this%mu = 0.d0
|
this%mu = 0.d0
|
||||||
END SUBROUTINE Ctqmc_allocateAll
|
END SUBROUTINE Ctqmc_allocateAll
|
||||||
!!***
|
!!***
|
||||||
|
|
|
@ -665,7 +665,7 @@ SUBROUTINE Ctqmcoffdiag_allocateAll(op)
|
||||||
op%measDE = 0.d0
|
op%measDE = 0.d0
|
||||||
|
|
||||||
FREEIF(op%mu)
|
FREEIF(op%mu)
|
||||||
MALLOC(op%mu, (1:flavors) )
|
MALLOC(op%mu,(1:flavors) )
|
||||||
op%mu = 0.d0
|
op%mu = 0.d0
|
||||||
FREEIF(op%hybri_limit)
|
FREEIF(op%hybri_limit)
|
||||||
MALLOC(op%hybri_limit, (flavors,flavors) )
|
MALLOC(op%hybri_limit, (flavors,flavors) )
|
||||||
|
|
|
@ -8,7 +8,7 @@
|
||||||
!! which leads to excellent CPU efficiency and OpenMP scalability.
|
!! which leads to excellent CPU efficiency and OpenMP scalability.
|
||||||
!!
|
!!
|
||||||
!! COPYRIGHT
|
!! COPYRIGHT
|
||||||
!! Copyright (C) 2014-2022 ABINIT group (AL)
|
!! Copyright (C) 2014-2022 ABINIT group (MS)
|
||||||
!! This file is distributed under the terms of the
|
!! This file is distributed under the terms of the
|
||||||
!! GNU General Public License, see ~abinit/COPYING
|
!! GNU General Public License, see ~abinit/COPYING
|
||||||
!! or http://www.gnu.org/copyleft/gpl.txt .
|
!! or http://www.gnu.org/copyleft/gpl.txt .
|
||||||
|
@ -360,7 +360,7 @@ contains
|
||||||
end if
|
end if
|
||||||
|
|
||||||
if(allocated(temp_realvec_r)) then
|
if(allocated(temp_realvec_r)) then
|
||||||
!$OMP TARGET EXIT DATA MAP(delete:temp_realvec_r,temp_realvec_i)
|
!$OMP TARGET EXIT DATA MAP(delete:kwa m_hamiltonian.F90,temp_realvec_i)
|
||||||
ABI_FREE(temp_realvec_r)
|
ABI_FREE(temp_realvec_r)
|
||||||
ABI_FREE(temp_realvec_i)
|
ABI_FREE(temp_realvec_i)
|
||||||
end if
|
end if
|
||||||
|
@ -800,7 +800,7 @@ contains
|
||||||
real(dp),intent(inout),target :: vectin(2,npwin*nspinor*ndat)
|
real(dp),intent(inout),target :: vectin(2,npwin*nspinor*ndat)
|
||||||
real(dp),intent(inout) :: enlout(nnlout*ndat)
|
real(dp),intent(inout) :: enlout(nnlout*ndat)
|
||||||
real(dp),intent(out),target :: svectout(:,:)
|
real(dp),intent(out),target :: svectout(:,:)
|
||||||
real(dp),intent(inout),target :: vectout(:,:) !vz_i
|
real(dp),intent(inout),target :: vectout(:,:)
|
||||||
real(dp),intent(inout),optional, ABI_CONTIGUOUS target :: vectproj(:,:,:)
|
real(dp),intent(inout),optional, ABI_CONTIGUOUS target :: vectproj(:,:,:)
|
||||||
type(pawcprj_type),intent(inout) :: cprjin(natom,nspinor*((cpopt+5)/5)*ndat)
|
type(pawcprj_type),intent(inout) :: cprjin(natom,nspinor*((cpopt+5)/5)*ndat)
|
||||||
|
|
||||||
|
@ -826,9 +826,7 @@ contains
|
||||||
character(len=500) :: msg
|
character(len=500) :: msg
|
||||||
integer(C_SIZE_T) :: byte_count
|
integer(C_SIZE_T) :: byte_count
|
||||||
#ifdef HAVE_GPU_HIP
|
#ifdef HAVE_GPU_HIP
|
||||||
type(c_ptr) :: vectin_amdcopy
|
type(c_ptr) :: vectin_amdcopy,vectout_amdcopy,svectout_amdcopy
|
||||||
type(c_ptr) :: vectout_amdcopy
|
|
||||||
type(c_ptr) :: svectout_amdcopy
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
! *************************************************************************
|
! *************************************************************************
|
||||||
|
|
|
@ -191,7 +191,6 @@ module m_hamiltonian
|
||||||
! Governs the choice of the GPU implementation:
|
! Governs the choice of the GPU implementation:
|
||||||
! = 0 ==> do not use GPU
|
! = 0 ==> do not use GPU
|
||||||
! > 0 ==> see defs_basis.F90 to have the list of possible GPU implementations
|
! > 0 ==> see defs_basis.F90 to have the list of possible GPU implementations
|
||||||
! = 666 ==> use openMP GPU implementation of hamiltonian operators
|
|
||||||
|
|
||||||
integer :: usecprj
|
integer :: usecprj
|
||||||
! usecprj= 1 if cprj projected WF are stored in memory
|
! usecprj= 1 if cprj projected WF are stored in memory
|
||||||
|
|
|
@ -759,6 +759,7 @@ has_fock=.false.
|
||||||
#ifndef HAVE_GPU_HIP
|
#ifndef HAVE_GPU_HIP
|
||||||
!$OMP TARGET EXIT DATA MAP(delete:work)
|
!$OMP TARGET EXIT DATA MAP(delete:work)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
end if ! type_calc
|
end if ! type_calc
|
||||||
ABI_NVTX_END_RANGE()
|
ABI_NVTX_END_RANGE()
|
||||||
|
|
||||||
|
@ -768,6 +769,7 @@ has_fock=.false.
|
||||||
!============================================================
|
!============================================================
|
||||||
! Application of the non-local potential and the Fock potential
|
! Application of the non-local potential and the Fock potential
|
||||||
!============================================================
|
!============================================================
|
||||||
|
|
||||||
ABI_NVTX_START_RANGE(NVTX_GETGHC_NLOCPOT)
|
ABI_NVTX_START_RANGE(NVTX_GETGHC_NLOCPOT)
|
||||||
if (type_calc==0 .or. type_calc==2) then
|
if (type_calc==0 .or. type_calc==2) then
|
||||||
signs=2 ; choice=1 ; nnlout=1 ; idir=0 ; tim_nonlop=1
|
signs=2 ; choice=1 ; nnlout=1 ; idir=0 ; tim_nonlop=1
|
||||||
|
|
|
@ -1467,8 +1467,7 @@ subroutine solve_inner_ompgpu(invovl, ham, cplx, mpi_enreg, proj, ndat, sm1proj,
|
||||||
integer :: additional_steps_to_take,idat,iproj,icplx
|
integer :: additional_steps_to_take,idat,iproj,icplx
|
||||||
integer :: Ptsize(3)
|
integer :: Ptsize(3)
|
||||||
#ifdef HAVE_GPU_HIP
|
#ifdef HAVE_GPU_HIP
|
||||||
type(c_ptr) :: sm1proj_amdcopy
|
type(c_ptr) :: sm1proj_amdcopy,PtPsm1proj_amdcopy
|
||||||
type(c_ptr) :: PtPsm1proj_amdcopy
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
! *************************************************************************
|
! *************************************************************************
|
||||||
|
|
|
@ -53,7 +53,7 @@ module m_prep_kgb
|
||||||
use m_ompgpu_fourwf
|
use m_ompgpu_fourwf
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||||
use m_nvtx
|
use m_nvtx
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -295,7 +295,7 @@ subroutine prep_getghc(cwavef, gs_hamk, gvnlxc, gwavef, swavef, lambda, blocksiz
|
||||||
|
|
||||||
if(do_transpose) then
|
if(do_transpose) then
|
||||||
call timab(545,3,tsec)
|
call timab(545,3,tsec)
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||||
call nvtxStartRange("MPI_AllToAllV", 8)
|
call nvtxStartRange("MPI_AllToAllV", 8)
|
||||||
#endif
|
#endif
|
||||||
if ( ((.not.flag_inv_sym) .and. bandpp==1 .and. mpi_enreg%paral_spinor==0 .and. my_nspinor==2 ).or. &
|
if ( ((.not.flag_inv_sym) .and. bandpp==1 .and. mpi_enreg%paral_spinor==0 .and. my_nspinor==2 ).or. &
|
||||||
|
@ -306,7 +306,7 @@ subroutine prep_getghc(cwavef, gs_hamk, gvnlxc, gwavef, swavef, lambda, blocksiz
|
||||||
call xmpi_alltoallv(cwavef,sendcountsloc,sdisplsloc,cwavef_alltoall2,&
|
call xmpi_alltoallv(cwavef,sendcountsloc,sdisplsloc,cwavef_alltoall2,&
|
||||||
& recvcountsloc,rdisplsloc,spaceComm,ier)
|
& recvcountsloc,rdisplsloc,spaceComm,ier)
|
||||||
end if
|
end if
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||||
call nvtxEndRange()
|
call nvtxEndRange()
|
||||||
#endif
|
#endif
|
||||||
call timab(545,2,tsec)
|
call timab(545,2,tsec)
|
||||||
|
@ -527,56 +527,56 @@ subroutine prep_getghc(cwavef, gs_hamk, gvnlxc, gwavef, swavef, lambda, blocksiz
|
||||||
if ( ((.not.flag_inv_sym) .and. bandpp==1 .and. mpi_enreg%paral_spinor==0 .and. my_nspinor==2 ).or. &
|
if ( ((.not.flag_inv_sym) .and. bandpp==1 .and. mpi_enreg%paral_spinor==0 .and. my_nspinor==2 ).or. &
|
||||||
& ((.not.flag_inv_sym) .and. bandpp>1) .or. flag_inv_sym ) then
|
& ((.not.flag_inv_sym) .and. bandpp>1) .or. flag_inv_sym ) then
|
||||||
if (sij_opt==1) then
|
if (sij_opt==1) then
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||||
call nvtxStartRange("MPI_AllToAllV", 8)
|
call nvtxStartRange("MPI_AllToAllV", 8)
|
||||||
#endif
|
#endif
|
||||||
call xmpi_alltoallv(swavef_alltoall1,recvcountsloc,rdisplsloc,swavef,&
|
call xmpi_alltoallv(swavef_alltoall1,recvcountsloc,rdisplsloc,swavef,&
|
||||||
& sendcountsloc,sdisplsloc,spaceComm,ier)
|
& sendcountsloc,sdisplsloc,spaceComm,ier)
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||||
call nvtxEndRange()
|
call nvtxEndRange()
|
||||||
#endif
|
#endif
|
||||||
end if
|
end if
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||||
call nvtxStartRange("MPI_AllToAllV", 8)
|
call nvtxStartRange("MPI_AllToAllV", 8)
|
||||||
#endif
|
#endif
|
||||||
if (.not.local_gvnlxc) call xmpi_alltoallv(gvnlxc_alltoall1,recvcountsloc,rdisplsloc,gvnlxc,&
|
if (.not.local_gvnlxc) call xmpi_alltoallv(gvnlxc_alltoall1,recvcountsloc,rdisplsloc,gvnlxc,&
|
||||||
& sendcountsloc,sdisplsloc,spaceComm,ier)
|
& sendcountsloc,sdisplsloc,spaceComm,ier)
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||||
call nvtxEndRange()
|
call nvtxEndRange()
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||||
call nvtxStartRange("MPI_AllToAllV", 8)
|
call nvtxStartRange("MPI_AllToAllV", 8)
|
||||||
#endif
|
#endif
|
||||||
call xmpi_alltoallv(gwavef_alltoall1,recvcountsloc,rdisplsloc,gwavef,&
|
call xmpi_alltoallv(gwavef_alltoall1,recvcountsloc,rdisplsloc,gwavef,&
|
||||||
& sendcountsloc,sdisplsloc,spaceComm,ier)
|
& sendcountsloc,sdisplsloc,spaceComm,ier)
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||||
call nvtxEndRange()
|
call nvtxEndRange()
|
||||||
#endif
|
#endif
|
||||||
else
|
else
|
||||||
if (sij_opt==1) then
|
if (sij_opt==1) then
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||||
call nvtxStartRange("MPI_AllToAllV", 8)
|
call nvtxStartRange("MPI_AllToAllV", 8)
|
||||||
#endif
|
#endif
|
||||||
call xmpi_alltoallv(swavef_alltoall2,recvcountsloc,rdisplsloc,swavef,&
|
call xmpi_alltoallv(swavef_alltoall2,recvcountsloc,rdisplsloc,swavef,&
|
||||||
& sendcountsloc,sdisplsloc,spaceComm,ier)
|
& sendcountsloc,sdisplsloc,spaceComm,ier)
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||||
call nvtxEndRange()
|
call nvtxEndRange()
|
||||||
#endif
|
#endif
|
||||||
end if
|
end if
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||||
call nvtxStartRange("MPI_AllToAllV", 8)
|
call nvtxStartRange("MPI_AllToAllV", 8)
|
||||||
#endif
|
#endif
|
||||||
if (.not.local_gvnlxc) call xmpi_alltoallv(gvnlxc_alltoall2,recvcountsloc,rdisplsloc,gvnlxc,&
|
if (.not.local_gvnlxc) call xmpi_alltoallv(gvnlxc_alltoall2,recvcountsloc,rdisplsloc,gvnlxc,&
|
||||||
& sendcountsloc,sdisplsloc,spaceComm,ier)
|
& sendcountsloc,sdisplsloc,spaceComm,ier)
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||||
call nvtxEndRange()
|
call nvtxEndRange()
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||||
call nvtxStartRange("MPI_AllToAllV", 8)
|
call nvtxStartRange("MPI_AllToAllV", 8)
|
||||||
#endif
|
#endif
|
||||||
call xmpi_alltoallv(gwavef_alltoall2,recvcountsloc,rdisplsloc,gwavef,&
|
call xmpi_alltoallv(gwavef_alltoall2,recvcountsloc,rdisplsloc,gwavef,&
|
||||||
& sendcountsloc,sdisplsloc,spaceComm,ier)
|
& sendcountsloc,sdisplsloc,spaceComm,ier)
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||||
call nvtxEndRange()
|
call nvtxEndRange()
|
||||||
#endif
|
#endif
|
||||||
end if
|
end if
|
||||||
|
@ -860,7 +860,7 @@ subroutine prep_nonlop(choice,cpopt,cwaveprj,enlout_block,hamk,idir,lambdablock,
|
||||||
|
|
||||||
if(do_transpose) then
|
if(do_transpose) then
|
||||||
call timab(581,1,tsec)
|
call timab(581,1,tsec)
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||||
call nvtxStartRange("MPI_AllToAllV", 8)
|
call nvtxStartRange("MPI_AllToAllV", 8)
|
||||||
#endif
|
#endif
|
||||||
if (bandpp/=1 .or. (bandpp==1 .and. mpi_enreg%paral_spinor==0.and.nspinortot==2)) then
|
if (bandpp/=1 .or. (bandpp==1 .and. mpi_enreg%paral_spinor==0.and.nspinortot==2)) then
|
||||||
|
@ -892,7 +892,7 @@ subroutine prep_nonlop(choice,cpopt,cwaveprj,enlout_block,hamk,idir,lambdablock,
|
||||||
call xmpi_alltoallv(cwavef,sendcountsloc,sdisplsloc,cwavef_alltoall2,&
|
call xmpi_alltoallv(cwavef,sendcountsloc,sdisplsloc,cwavef_alltoall2,&
|
||||||
& recvcountsloc,rdisplsloc,spaceComm,ier)
|
& recvcountsloc,rdisplsloc,spaceComm,ier)
|
||||||
end if
|
end if
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||||
call nvtxEndRange()
|
call nvtxEndRange()
|
||||||
#endif
|
#endif
|
||||||
call timab(581,2,tsec)
|
call timab(581,2,tsec)
|
||||||
|
@ -1000,43 +1000,43 @@ subroutine prep_nonlop(choice,cpopt,cwaveprj,enlout_block,hamk,idir,lambdablock,
|
||||||
call timab(581,1,tsec)
|
call timab(581,1,tsec)
|
||||||
if(bandpp/=1 .or. (bandpp==1 .and. mpi_enreg%paral_spinor==0.and.nspinortot==2))then
|
if(bandpp/=1 .or. (bandpp==1 .and. mpi_enreg%paral_spinor==0.and.nspinortot==2))then
|
||||||
if (paw_opt/=3) then
|
if (paw_opt/=3) then
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||||
call nvtxStartRange("MPI_AllToAllV", 8)
|
call nvtxStartRange("MPI_AllToAllV", 8)
|
||||||
#endif
|
#endif
|
||||||
call xmpi_alltoallv(gvnlc_alltoall1,recvcountsloc,rdisplsloc,gvnlc,&
|
call xmpi_alltoallv(gvnlc_alltoall1,recvcountsloc,rdisplsloc,gvnlc,&
|
||||||
& sendcountsloc,sdisplsloc,spaceComm,ier)
|
& sendcountsloc,sdisplsloc,spaceComm,ier)
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||||
call nvtxEndRange()
|
call nvtxEndRange()
|
||||||
#endif
|
#endif
|
||||||
end if
|
end if
|
||||||
if (paw_opt==3.or.paw_opt==4) then
|
if (paw_opt==3.or.paw_opt==4) then
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||||
call nvtxStartRange("MPI_AllToAllV", 8)
|
call nvtxStartRange("MPI_AllToAllV", 8)
|
||||||
#endif
|
#endif
|
||||||
call xmpi_alltoallv(gsc_alltoall1,recvcountsloc,rdisplsloc,gsc,&
|
call xmpi_alltoallv(gsc_alltoall1,recvcountsloc,rdisplsloc,gsc,&
|
||||||
& sendcountsloc,sdisplsloc,spaceComm,ier)
|
& sendcountsloc,sdisplsloc,spaceComm,ier)
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||||
call nvtxEndRange()
|
call nvtxEndRange()
|
||||||
#endif
|
#endif
|
||||||
end if
|
end if
|
||||||
else
|
else
|
||||||
if (paw_opt/=3) then
|
if (paw_opt/=3) then
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||||
call nvtxStartRange("MPI_AllToAllV", 8)
|
call nvtxStartRange("MPI_AllToAllV", 8)
|
||||||
#endif
|
#endif
|
||||||
call xmpi_alltoallv(gvnlc_alltoall2,recvcountsloc,rdisplsloc,gvnlc,&
|
call xmpi_alltoallv(gvnlc_alltoall2,recvcountsloc,rdisplsloc,gvnlc,&
|
||||||
& sendcountsloc,sdisplsloc,spaceComm,ier)
|
& sendcountsloc,sdisplsloc,spaceComm,ier)
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||||
call nvtxEndRange()
|
call nvtxEndRange()
|
||||||
#endif
|
#endif
|
||||||
end if
|
end if
|
||||||
if (paw_opt==3.or.paw_opt==4) then
|
if (paw_opt==3.or.paw_opt==4) then
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||||
call nvtxStartRange("MPI_AllToAllV", 8)
|
call nvtxStartRange("MPI_AllToAllV", 8)
|
||||||
#endif
|
#endif
|
||||||
call xmpi_alltoallv(gsc_alltoall2,recvcountsloc,rdisplsloc,gsc,&
|
call xmpi_alltoallv(gsc_alltoall2,recvcountsloc,rdisplsloc,gsc,&
|
||||||
& sendcountsloc,sdisplsloc,spaceComm,ier)
|
& sendcountsloc,sdisplsloc,spaceComm,ier)
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||||
call nvtxEndRange()
|
call nvtxEndRange()
|
||||||
#endif
|
#endif
|
||||||
end if
|
end if
|
||||||
|
@ -1282,7 +1282,7 @@ subroutine prep_fourwf(rhoaug,blocksize,cwavef,wfraug,iblock,istwf_k,mgfft,&
|
||||||
sdisplsloc(:)=sdispls(:)*2
|
sdisplsloc(:)=sdispls(:)*2
|
||||||
|
|
||||||
call timab(547,1,tsec)
|
call timab(547,1,tsec)
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||||
call nvtxStartRange("MPI_AllToAllV", 8)
|
call nvtxStartRange("MPI_AllToAllV", 8)
|
||||||
#endif
|
#endif
|
||||||
#if defined HAVE_GPU && defined HAVE_YAKL
|
#if defined HAVE_GPU && defined HAVE_YAKL
|
||||||
|
@ -1305,7 +1305,7 @@ subroutine prep_fourwf(rhoaug,blocksize,cwavef,wfraug,iblock,istwf_k,mgfft,&
|
||||||
call xmpi_alltoallv(cwavef,sendcountsloc,sdisplsloc,cwavef_alltoall2,&
|
call xmpi_alltoallv(cwavef,sendcountsloc,sdisplsloc,cwavef_alltoall2,&
|
||||||
& recvcountsloc,rdisplsloc,spaceComm,ier)
|
& recvcountsloc,rdisplsloc,spaceComm,ier)
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_MARKERS)
|
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||||
call nvtxEndRange()
|
call nvtxEndRange()
|
||||||
#endif
|
#endif
|
||||||
call timab(547,2,tsec)
|
call timab(547,2,tsec)
|
||||||
|
|
|
@ -975,13 +975,6 @@ subroutine forstrnps(cg,cprj,ecut,ecutsm,effmass_free,eigen,electronpositron,foc
|
||||||
gs_hamk%ucvol, gs_hamk%ffnl_k, gs_hamk%ph3d_k, gs_hamk%kpt_k, &
|
gs_hamk%ucvol, gs_hamk%ffnl_k, gs_hamk%ph3d_k, gs_hamk%kpt_k, &
|
||||||
gs_hamk%kg_k, gs_hamk%kpg_k, &
|
gs_hamk%kg_k, gs_hamk%kpg_k, &
|
||||||
compute_grad_strain=(stress_needed>0),compute_grad_atom=(optfor>0))
|
compute_grad_strain=(stress_needed>0),compute_grad_atom=(optfor>0))
|
||||||
!!FIXME signs==1 not handled in CUDA GEMM nonlop
|
|
||||||
!else if ( gpu_option /= ABI_GPU_LEGACY) then
|
|
||||||
! call make_gemm_nonlop_ompgpu(my_ikpt,gs_hamk%npw_fft_k,gs_hamk%lmnmax, &
|
|
||||||
! gs_hamk%ntypat, gs_hamk%indlmn, gs_hamk%nattyp, gs_hamk%istwf_k, &
|
|
||||||
! gs_hamk%ucvol, gs_hamk%ffnl_k, gs_hamk%ph3d_k, gs_hamk%kpt_k, &
|
|
||||||
! gs_hamk%kg_k, gs_hamk%kpg_k, &
|
|
||||||
! compute_grad_strain=(stress_needed>0),compute_grad_atom=(optfor>0))
|
|
||||||
else if ( gpu_option == ABI_GPU_OPENMP) then
|
else if ( gpu_option == ABI_GPU_OPENMP) then
|
||||||
call make_gemm_nonlop_ompgpu(my_ikpt,gs_hamk%npw_fft_k,gs_hamk%lmnmax, &
|
call make_gemm_nonlop_ompgpu(my_ikpt,gs_hamk%npw_fft_k,gs_hamk%lmnmax, &
|
||||||
gs_hamk%ntypat, gs_hamk%indlmn, gs_hamk%nattyp, gs_hamk%istwf_k, &
|
gs_hamk%ntypat, gs_hamk%indlmn, gs_hamk%nattyp, gs_hamk%istwf_k, &
|
||||||
|
@ -989,15 +982,6 @@ subroutine forstrnps(cg,cprj,ecut,ecutsm,effmass_free,eigen,electronpositron,foc
|
||||||
gs_hamk%kg_k, gs_hamk%kpg_k, &
|
gs_hamk%kg_k, gs_hamk%kpg_k, &
|
||||||
compute_grad_strain=(stress_needed>0),compute_grad_atom=(optfor>0))
|
compute_grad_strain=(stress_needed>0),compute_grad_atom=(optfor>0))
|
||||||
end if
|
end if
|
||||||
else
|
|
||||||
ABI_ERROR("istwfk > 2 is not handled with OpenMP GPU offload mode !")
|
|
||||||
end if
|
|
||||||
call make_gemm_nonlop_ompgpu(my_ikpt,gs_hamk%npw_fft_k,gs_hamk%lmnmax, &
|
|
||||||
gs_hamk%ntypat, gs_hamk%indlmn, gs_hamk%nattyp, gs_hamk%istwf_k, &
|
|
||||||
gs_hamk%ucvol, gs_hamk%ffnl_k, gs_hamk%ph3d_k, gs_hamk%kpt_k, &
|
|
||||||
gs_hamk%kg_k, gs_hamk%kpg_k, &
|
|
||||||
compute_grad_strain=(stress_needed>0),compute_grad_atom=(optfor>0))
|
|
||||||
end if
|
|
||||||
end if
|
end if
|
||||||
|
|
||||||
! Loop over (blocks of) bands; accumulate forces and/or stresses
|
! Loop over (blocks of) bands; accumulate forces and/or stresses
|
||||||
|
|
|
@ -180,6 +180,10 @@ subroutine mkrho(cg,dtset,gprimd,irrzon,kg,mcg,mpi_enreg,npwarr,occ,paw_dmft,phn
|
||||||
!arrays
|
!arrays
|
||||||
integer,allocatable :: gbound(:,:)
|
integer,allocatable :: gbound(:,:)
|
||||||
logical :: locc_test,nspinor1TreatedByThisProc,nspinor2TreatedByThisProc
|
logical :: locc_test,nspinor1TreatedByThisProc,nspinor2TreatedByThisProc
|
||||||
|
real(dp),allocatable :: occ_diag(:),cwavef_rot(:,:,:,:)
|
||||||
|
#if defined HAVE_GPUL
|
||||||
|
real(dp),allocatable :: weight_t(:) ! only allocated and used when use_gpu_cuda = 1
|
||||||
|
#endif
|
||||||
#if defined HAVE_GPU && defined HAVE_YAKL
|
#if defined HAVE_GPU && defined HAVE_YAKL
|
||||||
integer(int32),ABI_CONTIGUOUS pointer :: kg_k(:,:) => null()
|
integer(int32),ABI_CONTIGUOUS pointer :: kg_k(:,:) => null()
|
||||||
real(real64) :: dummy(2,1) = reshape( (/0.0, 0.0/), shape(dummy))
|
real(real64) :: dummy(2,1) = reshape( (/0.0, 0.0/), shape(dummy))
|
||||||
|
|
|
@ -3718,7 +3718,7 @@ subroutine wfd_change_ngfft(Wfd, Cryst, Psps, new_ngfft)
|
||||||
|
|
||||||
! Recalculate FFT tables.
|
! Recalculate FFT tables.
|
||||||
! Calculate the FFT index of $ R^{-1} (r-\tau) $ used to symmetrize u_Rk.
|
! Calculate the FFT index of $ R^{-1} (r-\tau) $ used to symmetrize u_Rk.
|
||||||
ABI_REMALLOC(Wfd%irottb, (Wfd%nfftot,Cryst%nsym) )
|
ABI_REMALLOC(Wfd%irottb, (Wfd%nfftot,Cryst%nsym))
|
||||||
call rotate_FFT_mesh(Cryst%nsym,Cryst%symrel,Cryst%tnons,Wfd%ngfft,Wfd%irottb,iscompatibleFFT)
|
call rotate_FFT_mesh(Cryst%nsym,Cryst%symrel,Cryst%tnons,Wfd%ngfft,Wfd%irottb,iscompatibleFFT)
|
||||||
|
|
||||||
if (.not. iscompatibleFFT) then
|
if (.not. iscompatibleFFT) then
|
||||||
|
|
|
@ -6036,7 +6036,7 @@ subroutine ddb_to_dtset(comm, dtset, filename, psps)
|
||||||
ABI_REMALLOC(dtset%spinat, (3,dtset%natom))
|
ABI_REMALLOC(dtset%spinat, (3,dtset%natom))
|
||||||
dtset%spinat(:,:) = ddb_hdr%spinat(1:3,1:ddb_hdr%matom)
|
dtset%spinat(:,:) = ddb_hdr%spinat(1:3,1:ddb_hdr%matom)
|
||||||
|
|
||||||
ABI_REMALLOC(dtset%xred_orig, (3,dtset%natom,mxnimage) )
|
ABI_REMALLOC(dtset%xred_orig, (3,dtset%natom,mxnimage))
|
||||||
dtset%xred_orig(:,:,1) = ddb_hdr%xred(1:3,1:ddb_hdr%matom)
|
dtset%xred_orig(:,:,1) = ddb_hdr%xred(1:3,1:ddb_hdr%matom)
|
||||||
|
|
||||||
ABI_REMALLOC(dtset%ziontypat, (dtset%ntypat))
|
ABI_REMALLOC(dtset%ziontypat, (dtset%ntypat))
|
||||||
|
|
|
@ -278,12 +278,6 @@ subroutine chebfiwf2(cg,dtset,eig,enl_out,gs_hamk,kinpw,mpi_enreg,&
|
||||||
type(pawcprj_type) :: cprj_dum(gs_hamk%natom,1)
|
type(pawcprj_type) :: cprj_dum(gs_hamk%natom,1)
|
||||||
|
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_YAKL)
|
#if defined(HAVE_GPU_CUDA) && defined(HAVE_YAKL)
|
||||||
! other
|
|
||||||
integer(kind=c_size_t) :: l_pcon_size_bytes
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_YAKL)
|
|
||||||
! other
|
|
||||||
integer(kind=c_size_t) :: l_pcon_size_bytes
|
integer(kind=c_size_t) :: l_pcon_size_bytes
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -581,7 +575,6 @@ subroutine getghc_gsc1(X,AX,BX,transposer)
|
||||||
! ABI_MALLOC(l_gvnlxc,(2,blockdim*spacedim))
|
! ABI_MALLOC(l_gvnlxc,(2,blockdim*spacedim))
|
||||||
!end if
|
!end if
|
||||||
|
|
||||||
|
|
||||||
call multithreaded_getghc(l_cpopt,cg,cprj_dum,ghc,gsc,&
|
call multithreaded_getghc(l_cpopt,cg,cprj_dum,ghc,gsc,&
|
||||||
l_gs_hamk,l_gvnlxc,eval,l_mpi_enreg,blockdim,l_prtvol,l_sij_opt,l_tim_getghc,0)
|
l_gs_hamk,l_gvnlxc,eval,l_mpi_enreg,blockdim,l_prtvol,l_sij_opt,l_tim_getghc,0)
|
||||||
|
|
||||||
|
@ -590,14 +583,6 @@ subroutine getghc_gsc1(X,AX,BX,transposer)
|
||||||
call gpu_device_synchronize()
|
call gpu_device_synchronize()
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#if defined(HAVE_GPU_CUDA) && defined(HAVE_YAKL)
|
|
||||||
!if (chebfi%gpu_option==ABI_GPU_KOKKOS) then
|
|
||||||
call gpu_device_synchronize()
|
|
||||||
!end if
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
!Scale cg, ghc, gsc
|
!Scale cg, ghc, gsc
|
||||||
if ( l_istwf == 2 ) then
|
if ( l_istwf == 2 ) then
|
||||||
call xgBlock_scale(X ,sqrt2,1,gpu_option=l_gs_hamk%gpu_option)
|
call xgBlock_scale(X ,sqrt2,1,gpu_option=l_gs_hamk%gpu_option)
|
||||||
|
@ -783,7 +768,6 @@ subroutine getBm1X(X,Bm1X,transposer)
|
||||||
!cwaveprj_next is dummy
|
!cwaveprj_next is dummy
|
||||||
if(gemm_nonlop_use_gemm) then
|
if(gemm_nonlop_use_gemm) then
|
||||||
ABI_MALLOC(cwaveprj_next, (1,1))
|
ABI_MALLOC(cwaveprj_next, (1,1))
|
||||||
else
|
|
||||||
else
|
else
|
||||||
ABI_MALLOC(cwaveprj_next, (l_gs_hamk%natom,l_nspinor*blockdim))
|
ABI_MALLOC(cwaveprj_next, (l_gs_hamk%natom,l_nspinor*blockdim))
|
||||||
call pawcprj_alloc(cwaveprj_next,0,l_gs_hamk%dimcprj)
|
call pawcprj_alloc(cwaveprj_next,0,l_gs_hamk%dimcprj)
|
||||||
|
@ -859,11 +843,11 @@ subroutine getBm1X(X,Bm1X,transposer)
|
||||||
end if
|
end if
|
||||||
end if
|
end if
|
||||||
end if
|
end if
|
||||||
|
|
||||||
if (l_paw) then
|
if (l_paw) then
|
||||||
if (l_useria /= 121212) then
|
ABI_FREE(cwaveprj_next)
|
||||||
ABI_FREE(cwaveprj_next)
|
|
||||||
end if
|
|
||||||
end if
|
end if
|
||||||
|
|
||||||
ABI_NVTX_END_RANGE()
|
ABI_NVTX_END_RANGE()
|
||||||
|
|
||||||
end subroutine getBm1X
|
end subroutine getBm1X
|
||||||
|
|
|
@ -122,7 +122,7 @@ subroutine lobpcgwf2(cg,dtset,eig,occ,enl_out,gs_hamk,isppol,ikpt,inonsc,istep,k
|
||||||
|
|
||||||
! Important things for NC
|
! Important things for NC
|
||||||
integer,parameter :: choice=1, paw_opt=0, signs=1
|
integer,parameter :: choice=1, paw_opt=0, signs=1
|
||||||
type(pawcprj_type) :: cprj_dum(gs_hamk%natom,1)
|
type(pawcprj_type) :: cprj_dum(1,1)
|
||||||
integer :: iband, shift
|
integer :: iband, shift
|
||||||
real(dp) :: gsc_dummy(0,0)
|
real(dp) :: gsc_dummy(0,0)
|
||||||
real(dp), allocatable :: l_gvnlxc(:,:)
|
real(dp), allocatable :: l_gvnlxc(:,:)
|
||||||
|
@ -355,7 +355,7 @@ end subroutine lobpcgwf2
|
||||||
type(xgBlock_t), intent(inout) :: BX
|
type(xgBlock_t), intent(inout) :: BX
|
||||||
integer :: blockdim
|
integer :: blockdim
|
||||||
integer :: spacedim
|
integer :: spacedim
|
||||||
type(pawcprj_type) :: cprj_dum(l_gs_hamk%natom,1)
|
type(pawcprj_type) :: cprj_dum(1,1)
|
||||||
double precision :: dum
|
double precision :: dum
|
||||||
double precision, parameter :: inv_sqrt2 = 1/sqrt2
|
double precision, parameter :: inv_sqrt2 = 1/sqrt2
|
||||||
double precision, pointer :: cg(:,:)
|
double precision, pointer :: cg(:,:)
|
||||||
|
|
|
@ -1031,15 +1031,6 @@ subroutine vtorho(afford,atindx,atindx1,cg,compch_fft,cprj,cpus,dbl_nnsclo,&
|
||||||
gs_hamk%ph3d_k,gs_hamk%kpt_k,gs_hamk%kg_k,gs_hamk%kpg_k, &
|
gs_hamk%ph3d_k,gs_hamk%kpt_k,gs_hamk%kg_k,gs_hamk%kpg_k, &
|
||||||
compute_grad_atom=(optforces>0))
|
compute_grad_atom=(optforces>0))
|
||||||
end if
|
end if
|
||||||
else
|
|
||||||
ABI_ERROR("istwfk > 2 is not handled with OpenMP GPU offload mode !")
|
|
||||||
end if
|
|
||||||
call make_gemm_nonlop_ompgpu(my_ikpt,gs_hamk%npw_fft_k,gs_hamk%lmnmax, &
|
|
||||||
gs_hamk%ntypat, gs_hamk%indlmn, gs_hamk%nattyp, gs_hamk%istwf_k, &
|
|
||||||
gs_hamk%ucvol, gs_hamk%ffnl_k, &
|
|
||||||
gs_hamk%ph3d_k,gs_hamk%kpt_k,gs_hamk%kg_k,gs_hamk%kpg_k, &
|
|
||||||
compute_grad_atom=(optforces>0))
|
|
||||||
end if
|
|
||||||
end if
|
end if
|
||||||
end if
|
end if
|
||||||
|
|
||||||
|
|
|
@ -596,7 +596,7 @@ subroutine outscfcv(atindx1,cg,compch_fft,compch_sph,cprj,dimcprj,dmatpawu,dtfil
|
||||||
|
|
||||||
! Output of the GSR file (except when we are inside mover)
|
! Output of the GSR file (except when we are inside mover)
|
||||||
#ifdef HAVE_NETCDF
|
#ifdef HAVE_NETCDF
|
||||||
#if 0
|
#ifndef FC_CRAY
|
||||||
if (me == master .and. dtset%prtgsr == 1 .and. dtset%usewvl == 0) then
|
if (me == master .and. dtset%prtgsr == 1 .and. dtset%usewvl == 0) then
|
||||||
!.and. (dtset%ionmov /= 0 .or. dtset%optcell /= 0)) then
|
!.and. (dtset%ionmov /= 0 .or. dtset%optcell /= 0)) then
|
||||||
fname = strcat(dtfil%filnam_ds(4), "_GSR.nc")
|
fname = strcat(dtfil%filnam_ds(4), "_GSR.nc")
|
||||||
|
|
|
@ -456,9 +456,6 @@ subroutine gstate(args_gs,acell,codvsn,cpui,dtfil,dtset,iexit,initialized,&
|
||||||
else if(dtset%gpu_option == ABI_GPU_DISABLED) then
|
else if(dtset%gpu_option == ABI_GPU_DISABLED) then
|
||||||
call init_gemm_nonlop(dtset%nkpt)
|
call init_gemm_nonlop(dtset%nkpt)
|
||||||
end if
|
end if
|
||||||
else if(dtset%gpu_option == ABI_GPU_DISABLED) then
|
|
||||||
call init_gemm_nonlop(dtset%nkpt)
|
|
||||||
end if
|
|
||||||
end if
|
end if
|
||||||
|
|
||||||
gemm_nonlop_is_distributed = .false.
|
gemm_nonlop_is_distributed = .false.
|
||||||
|
@ -1812,8 +1809,6 @@ subroutine gstate(args_gs,acell,codvsn,cpui,dtfil,dtset,iexit,initialized,&
|
||||||
call destroy_gemm_nonlop(dtset%nkpt)
|
call destroy_gemm_nonlop(dtset%nkpt)
|
||||||
else if(dtset%gpu_option==ABI_GPU_DISABLED) then
|
else if(dtset%gpu_option==ABI_GPU_DISABLED) then
|
||||||
call destroy_gemm_nonlop(dtset%nkpt)
|
call destroy_gemm_nonlop(dtset%nkpt)
|
||||||
else if(dtset%gpu_option==ABI_GPU_DISABLED) then
|
|
||||||
call destroy_gemm_nonlop(dtset%nkpt)
|
|
||||||
end if
|
end if
|
||||||
gemm_nonlop_use_gemm = .false.
|
gemm_nonlop_use_gemm = .false.
|
||||||
end if
|
end if
|
||||||
|
|
|
@ -68,8 +68,9 @@
|
||||||
|
|
||||||
#include "abi_common.h"
|
#include "abi_common.h"
|
||||||
|
|
||||||
! nvtx related macro definition
|
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||||
#include "nvtx_macros.h"
|
#include "nvtx_macros.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
program abinit
|
program abinit
|
||||||
|
|
||||||
|
@ -385,7 +386,7 @@ program abinit
|
||||||
end if
|
end if
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef HAVE_GPU_MARKERS
|
#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
|
||||||
NVTX_INIT(use_nvtx)
|
NVTX_INIT(use_nvtx)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue