diff --git a/Doc/user_guide.tex b/Doc/user_guide.tex
index e28ac836a..4aece316e 100644
--- a/Doc/user_guide.tex
+++ b/Doc/user_guide.tex
@@ -428,12 +428,11 @@ formerly PGI compiler, freely available for download.
 As a rule, \qe\ tries to keep compatibility with older compilers,
 avoiding nonstandard extensions and newer features that are not
-widespread or stabilized. If however your compiler is older say
-than $\sim 5$ years or so it is quite likely that something will
-not work. The same applies to mathematical and MPI libraries.
-For GPU compilation, get the most recent NVidia HPC SDK you can:
-while compilers from v. 17.4 on should work, several problems and
-limitations are known to exist for old compiler versions.
+widespread or stabilized. If, however, your compiler is older than
+a few ($\sim 5$) years, it is likely that something will not work.
+The same applies to mathematical and MPI libraries.
+For GPU compilation, you need v.19.10 or later of the NVidia HPC SDK
+(previous versions are no longer supported).
 Big computing centers typically provide a Fortran compiler complete
 with all needed libraries. Workstations or ``commodity'' machines
@@ -621,10 +620,11 @@ and the following optional packages:\\
 \end{tabular}\\
 \\
 In order to compile the code for GPU's you will need a recent version
--- the more recent, the better -- of the NVidia HPC software development
-kit (SDK). OpenMP must be enabled, and you may want to use a CUDA-aware MPI
-distribution if running on multiple GPUs in order to optimize the
-interprocess data transfer. The following \configure\ options are
+(v.19.10 or later: the more recent, the better) of the NVidia HPC software
+development kit (SDK). OpenMP should be enabled. Enabling faster communications
+between GPUs, via NVlink or Infiniband RDMA, is essential for optimal
+performance. If your MPI library is built to be CUDA-aware, then enable it
+with \texttt{--with-cuda-mpi=yes}. The following \configure\ options are
 available:\\
 \begin{tabular}{ll}
 \texttt{--with-cuda=value}& enable compilation of GPU-accelerated subroutines.\\
@@ -640,10 +640,9 @@ available:\\
 & \texttt{value} must be consistent with the\\
 & CUDA Toolkit installed on the workstation \\
 & or available on the compute nodes of the HPC facility.\\
-\texttt{--enable-cuda-env-check=[yes]}& if set, sanity checks on the CUDA environment\\
- & are performed (default: no).
+\texttt{--with-cuda-mpi=value} & enable usage of a CUDA-aware MPI library (default: no).\\
 \end{tabular}\\
 
 To modify or extend \configure, see the Wiki pages on GitLab:
 \texttt{https://gitlab.com/QEF/q-e/-/wikis}.
@@ -662,7 +661,7 @@ libraries (e.g. you need to add \texttt{-D\_\_FFTW} to
 \texttt{DFLAGS} if you want to link internal FFTW). For a correct
 choice of preprocessing flags, refer to the documentation in
 \texttt{include/defs.h.README}.
-Even if \configure\ works, yuo may need to tweak the \texttt{make.inc}
+Even if \configure\ works, you may need to tweak the \texttt{make.inc}
 file. It is very simple, but please note that if you change any
 settings (e.g. preprocessing, compilation flags)
 after a previous, successful or failed, compilation, you must run
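For reference, the \configure options documented above combine into an invocation along these lines. This is an illustrative sketch, not part of the patch: the cc70 architecture, the 11.0 runtime, and the `$CUDA_HOME` location are placeholder assumptions for a typical Volta node, to be adjusted to the actual machine.

```
# Sketch of a GPU build for a Volta (cc70) node with CUDA 11.0 under $CUDA_HOME;
# adjust --with-cuda-cc and --with-cuda-runtime to the installed hardware/toolkit.
./configure --with-cuda=$CUDA_HOME --with-cuda-cc=70 --with-cuda-runtime=11.0 \
            --enable-openmp --with-cuda-mpi=yes --with-scalapack=no
```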
diff --git a/PW/src/setup.f90 b/PW/src/setup.f90
index 3754568db..95a4e6c78 100644
--- a/PW/src/setup.f90
+++ b/PW/src/setup.f90
@@ -808,35 +808,15 @@ END SUBROUTINE setup_para
 !
 !----------------------------------------------------------------------------
 LOGICAL FUNCTION check_gpu_support( )
-  !
-  ! FIXME: seems useless. If one has GPUs, one wants to run on GPUs.
-  !
+  ! Minimal case: returns true if compiled for GPUs
   IMPLICIT NONE
   !
-  LOGICAL, SAVE :: first = .TRUE.
-  LOGICAL, SAVE :: saved_value = .FALSE.
-  CHARACTER(len=255) :: gpu_env
-  INTEGER :: vlen, istat
-
 #if defined(__CUDA)
-  IF( .NOT. first ) THEN
-     check_gpu_support = saved_value
-     RETURN
-  END IF
-  first = .FALSE.
-  !
-  CALL get_environment_variable("USEGPU", gpu_env, vlen, istat, .true.)
-  IF (istat == 0) THEN
-     check_gpu_support = (gpu_env /= "no")
-  ELSE
-     check_gpu_support = .TRUE.
-  END IF
-  saved_value = check_gpu_support
-  !
+  check_gpu_support = .TRUE.
#else
  check_gpu_support = .FALSE.
#endif
-  RETURN
+  !
 END FUNCTION check_gpu_support
 !
 !----------------------------------------------------------------------------
diff --git a/README_GPU.md b/README_GPU.md
index 909d25a76..27e00f43c 100644
--- a/README_GPU.md
+++ b/README_GPU.md
@@ -9,9 +9,9 @@ Installation
 ============
 
 This version requires the nvfortran (previously PGI) compiler from the
-freely available NVidia HPC SDK. You are advised to use the most recent
-version of NVidia software you can find. While any version later than 17.4
-should work, many glitches are known to exist in older versions.
+NVidia HPC SDK, v.19.10 or later (freely downloadable from NVidia).
+Earlier versions may or may not work and are no longer supported.
+You are advised to use the most recent version of NVidia software you can find.
 The `configure` script checks for the presence of the nvfortran compiler
 and of a few cuda libraries. For this reason the path pointing to the cuda
 toolkit must be present in `LD_LIBRARY_PATH`.
@@ -19,7 +19,7 @@ must be present in `LD_LIBRARY_PATH`.
 A template for the configure command is:
 
 ```
-./configure --with-cuda=XX --with-cuda-runtime=YY --with-cuda-cc=ZZ --enable-openmp [--enable-openacc] [ --with-scalapack=no ]
+./configure --with-cuda=XX --with-cuda-runtime=YY --with-cuda-cc=ZZ --enable-openmp [ --with-scalapack=no ] [ --with-cuda-mpi=yes ]
 ```
 
 where `XX` is the location of the CUDA Toolkit (in HPC environments is
@@ -32,14 +32,23 @@
 CUDA Driver Version: 11000
 Default Target: cc70
 ...
 ```
-The version is returned as (1000 major + 10 minor). For example, CUDA 9.2
-would be represented by 9020. For the above case, configure QE with:
+The version is returned as (1000 major + 10 minor). For example, CUDA 11.0
+is represented by 11000. For the above case, configure QE with:
 ```
 ./configure --with-cuda=$CUDA_HOME --with-cuda-cc=70 --with-cuda-runtime=11.0
 ```
 Alternatively, you may use the (deprecated) tool `get_device_props.py` in
 directory `dev-tools/`.
+
+Enabling faster communications between GPUs, via NVlink or Infiniband RDMA,
+is essential for optimal performance. If your MPI library is built to be
+CUDA-aware, then enable `--with-cuda-mpi=yes` (default: no).
+
+Serial (no MPI) compilation is also supported: use `--disable-parallel`.
+
+Option `--enable-openacc` is no longer honored: OpenACC is always required.
+
 It is generally a good idea to disable Scalapack when running small test
 cases since the serial GPU eigensolver outperforms the parallel CPU
 eigensolver in many circumstances.
@@ -48,8 +57,6 @@
 From time to time PGI links to the wrong CUDA libraries and fails reporting
 a problem in `cusolver` missing `GOmp` (GNU Openmp). This problem can be
 solved by removing the cuda toolkit from the `LD_LIBRARY_PATH` before
 compiling.
-Serial compilation is also supported.
-
 Execution
 =========
@@ -60,16 +67,5 @@ the beginning of the output
 GPU acceleration is ACTIVE.
 ```
 
-GPU acceleration can be switched off by setting the following environment
-variable:
-
-```
-$ export USEGPU=no
-```
-
-
-Testing
-=======
-
 The current GPU version passes all tests with both parallel and serial
 compilation.
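A practical note on `--with-cuda-mpi`: whether an MPI installation is CUDA-aware is not always obvious. If the library happens to be Open MPI, its build configuration can be queried with the standard `ompi_info` tool before enabling the option; other MPI implementations (MVAPICH2, Cray MPI, ...) need their own checks.

```
# Open MPI only: prints "...mpi_built_with_cuda_support:value:true" when
# the library was built with CUDA support.
ompi_info --parsable --all | grep mpi_built_with_cuda_support:value
```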
diff --git a/environment_variables b/environment_variables
index 0ee61b87c..a4ded58ab 100644
--- a/environment_variables
+++ b/environment_variables
@@ -108,7 +108,7 @@ export OMP_NUM_THREADS=1
 LC_ALL=C
 export LC_ALL
 
-NETWORK_PSEUDO=http://www.quantum-espresso.org/wp-content/uploads/upf_files/
+NETWORK_PSEUDO=https://www.quantum-espresso.org/wp-content/uploads/upf_files/
 
 # wget or curl needed if some PP has to be downloaded from web site
 # script wizard will surely find a better way to find what is available
diff --git a/install/configure b/install/configure
index 40952fe55..e675e542c 100755
--- a/install/configure
+++ b/install/configure
@@ -769,6 +769,7 @@ enable_static
 with_cuda
 with_cuda_cc
 with_cuda_runtime
+with_cuda_mpi
 enable_openacc
 with_libxc
 with_libxc_prefix
@@ -1438,6 +1439,7 @@ Optional Packages:
   --with-cuda-cc=VAL      GPU architecture (Kepler: 35, Pascal: 60, Volta: 70)
                           [default=35]
   --with-cuda-runtime=VAL CUDA runtime (Pascal: 8+, Volta: 9+)
                           [default=10.1]
+  --with-cuda-mpi=VAL     CUDA-aware MPI (yes|no) [default=no]
   --with-libxc            (yes|no) Use libXC for some XC functionals
                           (default: no)
   --with-libxc-prefix=DIR Directory where libxc was installed.
@@ -4191,6 +4193,16 @@ else
 fi
 
+
+# Check whether --with-cuda-mpi was given.
+if test "${with_cuda_mpi+set}" = set; then :
+  withval=$with_cuda_mpi;
+else
+  with_cuda_mpi=no
+fi
+
+
+
 # Check whether --enable-openacc was given.
 if test "${enable_openacc+set}" = set; then :
   enableval=$enable_openacc;
@@ -4316,6 +4328,9 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
     # Headers and libraries
     # -----------------------------------------
     try_dflags="$try_dflags -D__CUDA"
+    if test "$use_parallel" -eq 1 && test "$with_cuda_mpi" = "yes"; then
+        try_dflags="$try_dflags -D__GPU_MPI"
+    fi
     cuda_extlibs="devxlib"
     cuda_libs="$mMcudalib=cufft,cublas,cusolver,curand \$(TOPDIR)/external/devxlib/src/libdevXlib.a"
@@ -4328,21 +4343,17 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
     runtime_major_version=`echo $with_cuda_runtime | cut -d. -f1`
     runtime_minor_version=`echo $with_cuda_runtime | cut -d. -f2`
     if test "$runtime_major_version" -lt 10 ||
-       ( "$runtime_major_version" -eq 10 && "$runtime_minor_version" -lt 1 )
+       (test "$runtime_major_version" -eq 10 && test "$runtime_minor_version" -lt 1 )
     then
-        # CUDA toolkit v < 10.1: new solver not available
-        cuda_fflags="$cuda_fflags \$(MOD_FLAG)\$(TOPDIR)/EIGENSOLVER_GPU/lib_eigsolve"
-        cuda_extlibs="$cuda_extlibs eigensolver"
-        cuda_libs="$cuda_libs \$(TOPDIR)/EIGENSOLVER_GPU/lib_eigsolve/lib_eigsolve.a"
-        { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Using legacy custom solver." >&5
-$as_echo "$as_me: WARNING: Using legacy custom solver." >&2;}
+        # CUDA toolkit v < 10.1: cusolver not available
+        as_fn_error $? "Unsupported CUDA Toolkit, too old" "$LINENO" 5
     else
         try_dflags="$try_dflags -D__USE_CUSOLVER"
     fi
     # -----------------------------------------
-    # C flags - not sure whether they are suitable for old version as well
+    # C flags
     # -----------------------------------------
-    cuda_cflags=" -I$with_cuda/include -gpu=cc$with_cuda_cc,cuda$with_cuda_runtime"
+    cuda_cflags=" -I$with_cuda/include $mMcuda=cc$with_cuda_cc,cuda$with_cuda_runtime"
     ldflags="$ldflags $mMcuda=cc$with_cuda_cc,cuda$with_cuda_runtime"
     gpu_arch="$with_cuda_cc"
     cuda_runtime="$with_cuda_runtime"
@@ -4350,6 +4361,8 @@ $as_echo "$as_me: WARNING: Using legacy custom solver." >&2;}
        ldflags="$ldflags -acc"
        cuda_fflags="$cuda_fflags -acc"
        cuda_cflags="$cuda_cflags -acc"
+    else
+       as_fn_error $? "OpenACC must be enabled" "$LINENO" 5
     fi
 fi
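To see the effect of the new branch above, one can inspect the generated `make.inc` after configuring. This sketch assumes the usual QE convention that the preprocessor flags collected in `try_dflags` end up on the `DFLAGS` line of `make.inc`; paths and versions are placeholders as before.

```
./configure --with-cuda=$CUDA_HOME --with-cuda-cc=70 --with-cuda-runtime=11.0 \
            --enable-openmp --with-cuda-mpi=yes
# On a parallel build with a CUDA-aware MPI, -D__GPU_MPI should appear
# next to -D__CUDA in the output of:
grep '^DFLAGS' make.inc
```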
"Unsupported CUDA Toolkit, too old" "$LINENO" 5 else try_dflags="$try_dflags -D__USE_CUSOLVER" fi # ----------------------------------------- - # C flags - not sure whether they are suitable for old version as well + # C flags # ----------------------------------------- - cuda_cflags=" -I$with_cuda/include -gpu=cc$with_cuda_cc,cuda$with_cuda_runtime" + cuda_cflags=" -I$with_cuda/include $mMcuda=cc$with_cuda_cc,cuda$with_cuda_runtime" ldflags="$ldflags $mMcuda=cc$with_cuda_cc,cuda$with_cuda_runtime" gpu_arch="$with_cuda_cc" cuda_runtime="$with_cuda_runtime" @@ -4350,6 +4361,8 @@ $as_echo "$as_me: WARNING: Using legacy custom solver." >&2;} ldflags="$ldflags -acc" cuda_fflags="$cuda_fflags -acc" cuda_cflags="$cuda_cflags -acc" + else + as_fn_error $? "OpenACC must be enabled" "$LINENO" 5 fi fi diff --git a/install/m4/x_ac_qe_cuda.m4 b/install/m4/x_ac_qe_cuda.m4 index 73580bac9..9141ed897 100644 --- a/install/m4/x_ac_qe_cuda.m4 +++ b/install/m4/x_ac_qe_cuda.m4 @@ -48,6 +48,12 @@ AC_ARG_WITH([cuda-runtime], [AS_HELP_STRING([--with-cuda-runtime=VAL],[CUDA runtime (Pascal: 8+, Volta: 9+) @<:@default=10.1@:>@])], [], [with_cuda_runtime=10.1]) + +AC_ARG_WITH([cuda-mpi], + [AS_HELP_STRING([--with-cuda-mpi=VAL],[CUDA-aware MPI (yes|no) @<:@default=no@:>@])], + [], + [with_cuda_mpi=no]) + AC_ARG_ENABLE([openacc], [AS_HELP_STRING([--enable-openacc],[Enable compilation with OPENACC @<:@default=yes@:>@])], @@ -81,6 +87,9 @@ then # Headers and libraries # ----------------------------------------- try_dflags="$try_dflags -D__CUDA" + if test "$use_parallel" -eq 1 && test "$with_cuda_mpi" == "yes"; then + try_dflags="$try_dflags -D__GPU_MPI" + fi cuda_extlibs="devxlib" cuda_libs="$mMcudalib=cufft,cublas,cusolver,curand \$(TOPDIR)/external/devxlib/src/libdevXlib.a" @@ -93,20 +102,17 @@ then runtime_major_version=`echo $with_cuda_runtime | cut -d. -f1` runtime_minor_version=`echo $with_cuda_runtime | cut -d. -f2` if test "$runtime_major_version" -lt 10 || - ( "$runtime_major_version" -eq 10 && "$runtime_minor_version" -lt 1 ) + (test "$runtime_major_version" -eq 10 && test "$runtime_minor_version" -lt 1 ) then - # CUDA toolkit v < 10.1: new solver not available - cuda_fflags="$cuda_fflags \$(MOD_FLAG)\$(TOPDIR)/EIGENSOLVER_GPU/lib_eigsolve" - cuda_extlibs="$cuda_extlibs eigensolver" - cuda_libs="$cuda_libs \$(TOPDIR)/EIGENSOLVER_GPU/lib_eigsolve/lib_eigsolve.a" - AC_MSG_WARN([Using legacy custom solver.]) + # CUDA toolkit v < 10.1: cusolver not available + AC_MSG_ERROR([Unsupported CUDA Toolkit, too old]) else try_dflags="$try_dflags -D__USE_CUSOLVER" fi # ----------------------------------------- - # C flags - not sure whether they are suitable for old version as well + # C flags # ----------------------------------------- - cuda_cflags=" -I$with_cuda/include -gpu=cc$with_cuda_cc,cuda$with_cuda_runtime" + cuda_cflags=" -I$with_cuda/include $mMcuda=cc$with_cuda_cc,cuda$with_cuda_runtime" ldflags="$ldflags $mMcuda=cc$with_cuda_cc,cuda$with_cuda_runtime" gpu_arch="$with_cuda_cc" cuda_runtime="$with_cuda_runtime" @@ -114,6 +120,8 @@ then ldflags="$ldflags -acc" cuda_fflags="$cuda_fflags -acc" cuda_cflags="$cuda_cflags -acc" + else + AC_MSG_ERROR([OpenACC must be enabled]) fi fi