mirror of https://gitlab.com/QEF/q-e.git
Compile and run QE-GPU on CRAY XK7 using PGI compilers -- updates
git-svn-id: http://qeforge.qe-forge.org/svn/q-e/trunk/espresso@10459 c92efa57-630b-4861-b058-cf58834340f0
parent 6d6e8ccaff
commit c374d873d6

@@ -0,0 +1,172 @@
# make.sys. Generated from make.sys.in by configure.

# compilation rules

.SUFFIXES :
.SUFFIXES : .o .c .f .f90 .cu

# most fortran compilers can directly preprocess c-like directives: use
# 	$(MPIF90) $(F90FLAGS) -c $<
# if explicit preprocessing by the C preprocessor is needed, use:
# 	$(CPP) $(CPPFLAGS) $< -o $*.F90
# 	$(MPIF90) $(F90FLAGS) -c $*.F90 -o $*.o
# remember the tabulator in the first column !!!
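# Illustrative sketch only (not enabled in this make.sys): the explicit
# two-step rule assembled from the commands above, with a tab before each command:
#
# .f90.o:
# 	$(CPP) $(CPPFLAGS) $< -o $*.F90
# 	$(MPIF90) $(F90FLAGS) -c $*.F90 -o $*.o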

.f90.o:
	$(MPIF90) $(F90FLAGS) -c $<

# .f.o and .c.o: do not modify

.f.o:
	$(F77) $(FFLAGS) -c $<

.c.o:
	$(CC) $(CFLAGS) -c $<

# CUDA files
.cu.o:
	$(NVCC) $(NVCCFLAGS) -I../../include $(IFLAGS) $(DFLAGS) -c $<

# topdir for linking espresso libs with plugins
TOPDIR = /ccs/home/spigafi/espresso/GPU/../


# DFLAGS  = precompilation options (possible arguments to -D and -U)
#           used by the C compiler and preprocessor
# FDFLAGS = as DFLAGS, for the f90 compiler
# See include/defs.h.README for a list of options and their meaning
# With the exception of IBM xlf, FDFLAGS = $(DFLAGS)
# For IBM xlf, FDFLAGS is the same as DFLAGS with separating commas
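# Illustrative example only (this make.sys targets PGI, not xlf): options that read
#   DFLAGS  = -D__AIX -D__ESSL
# would be written for IBM xlf as
#   FDFLAGS = -D__AIX,-D__ESSL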

# MANUAL_DFLAGS = additional precompilation option(s), if desired
# You may use this instead of tweaking DFLAGS and FDFLAGS
# BEWARE: will not work for IBM xlf! Manually edit FDFLAGS
MANUAL_DFLAGS =
DFLAGS = -D__PGI -D__IOTK_SAFEST -D__FFTW -D__MPI -D__PARA -D__SCALAPACK -D__CUDA -D__OPENMP -D__ELPA -D__PHIGEMM $(MANUAL_DFLAGS)
FDFLAGS = $(DFLAGS) $(MANUAL_DFLAGS)

# IFLAGS = how to locate directories where files to be included are
# In most cases, IFLAGS = -I../include

IFLAGS = -I$(MPICH_DIR)/include -I../include -I/ccs/home/spigafi/espresso/GPU/..//phiGEMM/include $(CRAY_CUDATOOLKIT_INCLUDE_OPTS)

# MOD_FLAGS = flag used by f90 compiler to locate modules
# Each Makefile defines the list of needed modules in MODFLAGS

MOD_FLAG = -I
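# Sketch of how a package Makefile typically builds MODFLAGS from MOD_FLAG
# (directory list is illustrative, assuming the usual espresso tree layout):
#   MODFLAGS = $(MOD_FLAG)../iotk/src $(MOD_FLAG)../Modules $(MOD_FLAG).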

# Compilers: fortran-90, fortran-77, C
# If a parallel compilation is desired, MPIF90 should be a fortran-90
# compiler that produces executables for parallel execution using MPI
# (such as for instance mpif90, mpf90, mpxlf90,...);
# otherwise, an ordinary fortran-90 compiler (f90, g95, xlf90, ifort,...)
# If you have a parallel machine but no suitable candidate for MPIF90,
# try to specify the directory containing "mpif.h" in IFLAGS
# and to specify the location of MPI libraries in MPI_LIBS
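# Hypothetical example (paths are placeholders, only needed when no MPI wrapper exists):
#   IFLAGS   = -I/opt/mpich/include -I../include
#   MPI_LIBS = -L/opt/mpich/lib -lmpich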

MPIF90 = pgf90
#F90 = pgf90
CC = pgcc
F77 = pgf77

# C preprocessor and preprocessing flags - for explicit preprocessing,
# if needed (see the compilation rules above)
# preprocessing flags must include DFLAGS and IFLAGS

CPP = cpp
CPPFLAGS = -P -traditional $(DFLAGS) $(IFLAGS)

# compiler flags: C, F90, F77
# C flags must include DFLAGS and IFLAGS
# F90 flags must include MODFLAGS, IFLAGS, and FDFLAGS with appropriate syntax

CFLAGS = -O3 -D__align__\(n\)=__attribute__\(\(aligned\(n\)\)\) -D__location__\(a\)=__annotate__\(a\) -DCUDARTAPI= -D__x86_64 $(DFLAGS) $(IFLAGS)
F90FLAGS = -O3 -Mcache_align -r8 -Mpreprocess -mp=nonuma $(FDFLAGS) $(IFLAGS) $(MODFLAGS)
FFLAGS = -O3 -r8 -mp=nonuma

# compiler flags without optimization for fortran-77
# the latter is NEEDED to properly compile dlamch.f, used by lapack

FFLAGS_NOOPT = -O0

# compiler flag needed by some compilers when the main is not fortran
# Currently used for Yambo

FFLAGS_NOMAIN = -Mnomain

# Linker, linker-specific flags (if any)
# Typically LD coincides with F90 or MPIF90, LD_LIBS is empty

LD = pgf90
LDFLAGS = -v -mp=nonuma
LD_LIBS = $(MPICH_DIR)/lib/libmpich_pgi.so /opt/cray/libsci/12.0.00/pgi/119/interlagos/lib/libsci_pgi_mp.a $(CRAY_CUDATOOLKIT_POST_LINK_OPTS) -lcublas -lcufft

# External Libraries (if any) : blas, lapack, fft, MPI

# If you have nothing better, use the local copy :
# BLAS_LIBS = /your/path/to/espresso/BLAS/blas.a
# BLAS_LIBS_SWITCH = internal

BLAS_LIBS = /ccs/home/spigafi/espresso/GPU/..//phiGEMM/lib/libphigemm.a
BLAS_LIBS_SWITCH = external

# OpenBLAS is used to exploit multi-core CPU if a multi-threaded BLAS
# is not used or installed in the system (i.e. MKL is missing)
OPENBLAS_INTERNAL = 0

# If you have nothing better, use the local copy :
# LAPACK_LIBS = /your/path/to/espresso/lapack-3.2/lapack.a
# LAPACK_LIBS_SWITCH = internal
# For IBM machines with essl (-D__ESSL): load essl BEFORE lapack !
# remember that LAPACK_LIBS precedes BLAS_LIBS in loading order

# CBLAS is used in case the C interface for BLAS is missing (i.e. ACML)
CBLAS_ENABLED = 0

LAPACK_LIBS =
LAPACK_LIBS_SWITCH = external

ELPA_LIBS_SWITCH = enabled
SCALAPACK_LIBS = $(TOPDIR)/ELPA/libelpa.a

# nothing needed here if the internal copy of FFTW is compiled
# (needs -D__FFTW in DFLAGS)

FFT_LIBS =

# For parallel execution, the correct path to MPI libraries must
# be specified in MPI_LIBS (except for IBM if you use mpxlf)

MPI_LIBS =

# IBM-specific: MASS libraries, if available and if -D__MASS is defined in FDFLAGS

MASS_LIBS =

# ar command and flags - for most architectures: AR = ar, ARFLAGS = ruv

AR = ar
ARFLAGS = ruv

# ranlib command. If ranlib is not needed (it isn't in most cases) use
# RANLIB = echo

RANLIB = ranlib

# all internal and external libraries - do not modify

FLIB_TARGETS = all

# CUDA section
NVCC = /opt/nvidia/cudatoolkit/5.0.35.102/bin/nvcc
NVCCFLAGS = -O3 -gencode arch=compute_30,code=sm_35

PHIGEMM_INTERNAL = 1
PHIGEMM_SYMBOLS = 1
MAGMA_INTERNAL = 0

LIBOBJS = ../flib/ptools.a ../flib/flib.a ../clib/clib.a ../iotk/src/libiotk.a
LIBS = $(SCALAPACK_LIBS) $(LAPACK_LIBS) $(FFT_LIBS) $(BLAS_LIBS) $(MPI_LIBS) $(MASS_LIBS) $(LD_LIBS)

# wget or curl - useful to download from network
WGET = wget -O
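# If wget is unavailable, an equivalent curl-based setting would typically be
# (assumption, not part of this generated file):
#   WGET = curl -o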

@@ -1,7 +1,4 @@
Info by F. Spiga (spiga -dot- filippo -at- gmail -dot- com) -- Jun 19, 2013

Machine name : TODI at CSCS (CH)
Machine spec : http://user.cscs.ch/hardware/todi_cray_xk7/index.html
Info by Filippo Spiga -- Sept 18, 2013

Machine name : TITAN at Oak Ridge National Laboratory (USA)
Machine spec : https://www.olcf.ornl.gov/computing-resources/titan-cray-xk7/

@@ -10,7 +7,8 @@ Machine spec : https://www.olcf.ornl.gov/computing-resources/titan-cray-xk7/
Other CRAY XK7 systems might have different modules, please check for
equivalents if the ones mentioned are missing


0. Architecture peculiarities

CRAY XK7 systems currently in operation are equipped with one AMD 16-core
@@ -34,7 +32,9 @@ support a resource manager to identify a subset of free nodes in the cluster
to minimize hops. Please refer to the specific user guide provided by your HPC
centre.

1. Compile the code


1. Compile GPU-accelerated PWscf

Up to now, extensive tests have proven that the Intel compiler is the best choice
to exploit GPU capabilities of QE-GPU on CRAY XK7 nodes.
@@ -46,56 +46,96 @@ is used.
Intel:
- compile: ok
- CPU execution: ok
- GPU execution without CRAY_CUDA_PROXY: ok
- GPU execution with CRAY_CUDA_PROXY: ok
- GPU execution: ok

PGI, CRAY:
PGI:
- compile: ok
- CPU execution: ok
- GPU execution without CRAY_CUDA_PROXY: *KO*
- GPU execution with CRAY_CUDA_PROXY: *KO*
- GPU execution: ok

GNU: not tested yet

# NOTE:
Priority will be given to the Intel compiler since future generations of CRAY
systems will be equipped with Intel processors (see CRAY XC30).

After login to a system...

1.2 Titan (ORNL) :
module load cudatoolkit
module unload atp totalview-support xt-totalview hss-llm

PGI is the default after login...
export FINALDIR=$HOME/whatever
mkdir -p $FINALDIR

$ module switch PrgEnv-pgi PrgEnv-intel
$ module load cudatoolkit
$ module unload atp totalview-support xt-totalview hss-llm
$ module load gcc/4.4.4
make -f Makefile.gpu distclean
cd GPU/
./configure --enable-parallel --enable-openmp --enable-cuda --with-gpu-arch=35 --with-cuda-dir=${CRAY_CUDATOOLKIT_DIR} --without-magma --with-phigemm --with-scalapack ARCH=crayxt
cd ../
make -f Makefile.gpu pw-gpu
cp GPU/PW/pw-gpu.x ${FINALDIR}/pw-mpi-omp-gpu-scalapack.x

$ export FINALDIR=$HOME/whatever
$ mkdir -p $FINALDIR
_or_

$ make -f Makefile.gpu distclean
$ cd GPU/
$ ./configure --enable-parallel --enable-openmp --enable-cuda --with-gpu-arch=35 --with-cuda-dir=${CRAY_CUDATOOLKIT_DIR} --disable-magma --enable-phigemm --with-scalapack --disable-profiling ARCH=crayxt
$ cd ../
$ make -f Makefile.gpu pw-gpu
$ cp GPU/PW/pw-gpu.x ${FINALDIR}/pw-mpi-omp-gpu-scalapack.x
make -f Makefile.gpu distclean
cd GPU/
./configure --enable-parallel --enable-openmp --enable-cuda --with-gpu-arch=35 --with-cuda-dir=${CRAY_CUDATOOLKIT_DIR} --without-magma --with-phigemm --with-scalapack --with-elpa ARCH=crayxt
cd ../
make -f Makefile.gpu pw-gpu
cp GPU/PW/pw-gpu.x ${FINALDIR}/pw-mpi-omp-gpu-elpa.x


IMPORTANT NOTE (1): in the case of the PGI compiler, it is possible to compile a
light version of the executable by avoiding the CRAY wrappers. Look at the file
"" as an example.

IMPORTANT NOTE (2): On TITAN the PGI compiler is the default right after login. If
Intel is the preferred choice, do
$ module switch PrgEnv-pgi PrgEnv-intel
immediately after the login.


3. Running (using TORQUE/PBS Pro)

Refer to this link
https://www.olcf.ornl.gov/kb_articles/spider-the-center-wide-lustre-file-system/
about the filesystem management and availability

IMPORTANT NOTE (1):
The per node charging factor changed from 16 to 30 to reflect the availability
of GPU/Accelerators. Job utilization is now calculated via the formula:
30 * wall-time * number of requested nodes
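(Illustrative arithmetic only, with hypothetical numbers: an 8-node job that runs
for 2 hours of wall-time would be charged 30 * 2 * 8 = 480 core-hours.)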

IMPORTANT NOTE (2):
Projects granted at ORNL usually have three letters (XXX) and three digits (YYY)


#!/bin/csh
#
# Example requesting 8 nodes (64 cores in total in SINGLE STREAM MODE
# using 8 OpenMP threads per MPI process), 1 MPI process per node (8 in total)
# dedicating the full NVIDIA K20x resource to a single MPI process.
#
#PBS -A <XXXYYY>
#PBS -N QE-BENCH
#PBS -j oe
#PBS -l walltime=1:00:00,nodes=8

cd $PBS_O_WORKDIR

# Enable _only_ if '-N' > 1 (see below)
#setenv CRAY_CUDA_PROXY 1

# DEBUG
#setenv MPICH_ENV_DISPLAY 1

# _REMEMBER_
# '-n' : number of PEs or total MPI processes
# '-d' : number of OpenMP threads per MPI process
# '-N' : number of MPI processes per node
# '-j' : number of CPUs used per compute unit (-j 1 = single stream mode)

setenv OMP_NUM_THREADS 8
aprun -N 1 -n 8 -j 1 -d 8 -cc numa_node ./pw-mpi-omp-gpu.x -input ausurf_gamma.in | tee out.GPU.1-PER-NODE.$PBS_JOBID.v1

$ make -f Makefile.gpu distclean
$ cd GPU/
$ ./configure --enable-parallel --enable-openmp --enable-cuda --with-gpu-arch=35 --with-cuda-dir=${CRAY_CUDATOOLKIT_DIR} --disable-magma --enable-phigemm --with-scalapack --disable-profiling --with-elpa ARCH=crayxt
$ cd ../
$ make -f Makefile.gpu pw-gpu
$ cp GPU/PW/pw-gpu.x ${FINALDIR}/pw-mpi-omp-gpu-elpa.x

$ make -f Makefile.gpu distclean
$ cd GPU/
$ ./configure --enable-parallel --enable-openmp --enable-cuda --with-gpu-arch=35 --with-cuda-dir=${CRAY_CUDATOOLKIT_DIR} --enable-magma --enable-phigemm --without-scalapack --disable-profiling --with-internal-cblas ARCH=crayxt
$ cd ../
$ make -f Makefile.gpu pw-gpu
$ cp GPU/PW/pw-gpu.x ${FINALDIR}/pw-mpi-omp-gpu-magma.x


2. Good practices

- Each NVIDIA Tesla K20 GPU has 6 GB of memory on the card. Better to limit
@@ -108,91 +148,11 @@ $ cp GPU/PW/pw-gpu.x ${FINALDIR}/pw-mpi-omp-gpu-magma.x
- In order to share the GPU between multiple MPI processes within the node, it is
  mandatory to export the variable CRAY_CUDA_PROXY ("export CRAY_CUDA_PROXY=1")

- compiling with hugepage support does not produce big benefits, needs
- compiling with huge-pages support does not produce big benefits, needs
  more testing...


3. Example scripts (CSCS, SLURM)

3.1 TODI (SLURM)

#!/bin/bash

# Example requesting 4 nodes (32 cores in total in SINGLE STREAM MODE
# using 4 OpenMP threads per MPI process), 2 MPI processes per node (8 in total)
# sharing the NVIDIA K20x among them.

#SBATCH --job-name="QE-TEST"
#SBATCH --nodes=4
#SBATCH --time=00:25:00
#SBATCH --output=QE-BENCH.%j.o
#SBATCH --error=QE-BENCH.%j.e
#SBATCH --account=<...>

export CRAY_CUDA_PROXY=1

#export MALLOC_MMAP_MAX_=0
#export MALLOC_TRIM_THRESHOLD_=536870912

export MPICH_VERSION_DISPLAY=1
export MPICH_ENV_DISPLAY=1
export MPICH_CPUMASK_DISPLAY=1

# REMEMBER...
# '-n' : number of PEs or total MPI processes
# '-d' : number of OpenMP threads per MPI process

#export OMP_NUM_THREADS=8
#aprun -n 4 -j 1 -d 8 ./pw-mpi-omp-gpu.x -input ausurf_gamma.in | tee out.GPU.1-PER-NODE.$SLURM_JOB_ID.v1

export OMP_NUM_THREADS=4
aprun -n 8 -j 1 -d 4 ./pw-mpi-omp-gpu.x -input ausurf_gamma.in | tee out.GPU.2-PER-NODE.$SLURM_JOB_ID.v1

#export OMP_NUM_THREADS=2
#aprun -n 16 -j 1 -d 2 ./pw-mpi-omp-gpu.x -input ausurf_gamma.in | tee out.GPU.4-PER-NODE.$SLURM_JOB_ID.v1


3.2 TITAN (PBS Pro)

# IMPORTANT NOTE (1):
The per node charging factor changed from 16 to 30 to reflect the availability
of GPU/Accelerators. Job utilization is now calculated via the formula:
30 * walltime * number of requested nodes

# IMPORTANT NOTE (2):
Projects granted at ORNL usually have three letters (XXX) and three digits (YYY)

#!/bin/csh

# Example requesting 8 nodes (64 cores in total in SINGLE STREAM MODE
# using 8 OpenMP threads per MPI process), 1 MPI process per node (8 in total)
# dedicating the full NVIDIA K20x resource to a single MPI process.

#PBS -A <XXXYYY>
#PBS -N QE-BENCH
#PBS -j oe
#PBS -l walltime=1:00:00,nodes=8

cd $PBS_O_WORKDIR


setenv CRAY_CUDA_PROXY 1
setenv MPICH_ENV_DISPLAY 1

# REMEMBER...
# '-n' : number of PEs or total MPI processes
# '-d' : number of OpenMP threads per MPI process

setenv OMP_NUM_THREADS 8
aprun -n 8 -j 1 -d 8 ./pw-mpi-omp-gpu.x -input ausurf_gamma.in | tee out.GPU.1-PER-NODE.$PBS_JOBID.v1


# IMPORTANT NOTE:
Refer to this link
https://www.olcf.ornl.gov/kb_articles/spider-the-center-wide-lustre-file-system/
about the filesystem management and availability


4. Benchmarks
5. Benchmarks

[TO BE ADDED]