Compile and run QE-GPU on CRAY XK7 using PGI compilers -- updates

git-svn-id: http://qeforge.qe-forge.org/svn/q-e/trunk/espresso@10459 c92efa57-630b-4861-b058-cf58834340f0
spigafi 2013-09-18 21:46:47 +00:00
parent 6d6e8ccaff
commit c374d873d6
2 changed files with 256 additions and 124 deletions

install/Make.CRAY-XK7 (new file)

@@ -0,0 +1,172 @@
# make.sys. Generated from make.sys.in by configure.
# compilation rules
.SUFFIXES :
.SUFFIXES : .o .c .f .f90 .cu
# most fortran compilers can directly preprocess c-like directives: use
# $(MPIF90) $(F90FLAGS) -c $<
# if explicit preprocessing by the C preprocessor is needed, use:
# $(CPP) $(CPPFLAGS) $< -o $*.F90
# $(MPIF90) $(F90FLAGS) -c $*.F90 -o $*.o
# remember the tabulator in the first column !!!
.f90.o:
$(MPIF90) $(F90FLAGS) -c $<
# .f.o and .c.o: do not modify
.f.o:
$(F77) $(FFLAGS) -c $<
.c.o:
$(CC) $(CFLAGS) -c $<
# CUDA files
.cu.o:
$(NVCC) $(NVCCFLAGS) -I../../include $(IFLAGS) $(DFLAGS) -c $<
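# For illustration only: with the NVCC and NVCCFLAGS values defined further
# below, the .cu.o rule above compiles a hypothetical file mykernel.cu roughly as
#   nvcc -O3 -gencode arch=compute_30,code=sm_35 -I../../include $(IFLAGS) $(DFLAGS) -c mykernel.cu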
# topdir for linking espresso libs with plugins
TOPDIR = /ccs/home/spigafi/espresso/GPU/../
# DFLAGS = precompilation options (possible arguments to -D and -U)
# used by the C compiler and preprocessor
# FDFLAGS = as DFLAGS, for the f90 compiler
# See include/defs.h.README for a list of options and their meaning
# With the exception of IBM xlf, FDFLAGS = $(DFLAGS)
# For IBM xlf, FDFLAGS is the same as DFLAGS with separating commas
# MANUAL_DFLAGS = additional precompilation option(s), if desired
# You may use this instead of tweaking DFLAGS and FDFLAGS
# BEWARE: will not work for IBM xlf! Manually edit FDFLAGS
MANUAL_DFLAGS =
DFLAGS = -D__PGI -D__IOTK_SAFEST -D__FFTW -D__MPI -D__PARA -D__SCALAPACK -D__CUDA -D__OPENMP -D__ELPA -D__PHIGEMM $(MANUAL_DFLAGS)
FDFLAGS = $(DFLAGS) $(MANUAL_DFLAGS)
# IFLAGS = how to locate directories where files to be included are
# In most cases, IFLAGS = -I../include
IFLAGS = -I$(MPICH_DIR)/include -I../include -I/ccs/home/spigafi/espresso/GPU/..//phiGEMM/include $(CRAY_CUDATOOLKIT_INCLUDE_OPTS)
# MOD_FLAGS = flag used by f90 compiler to locate modules
# Each Makefile defines the list of needed modules in MODFLAGS
MOD_FLAG = -I
# Compilers: fortran-90, fortran-77, C
# If a parallel compilation is desired, MPIF90 should be a fortran-90
# compiler that produces executables for parallel execution using MPI
# (such as for instance mpif90, mpf90, mpxlf90,...);
# otherwise, an ordinary fortran-90 compiler (f90, g95, xlf90, ifort,...)
# If you have a parallel machine but no suitable candidate for MPIF90,
# try to specify the directory containing "mpif.h" in IFLAGS
# and to specify the location of MPI libraries in MPI_LIBS
MPIF90 = pgf90
#F90 = pgf90
CC = pgcc
F77 = pgf77
# C preprocessor and preprocessing flags - for explicit preprocessing,
# if needed (see the compilation rules above)
# preprocessing flags must include DFLAGS and IFLAGS
CPP = cpp
CPPFLAGS = -P -traditional $(DFLAGS) $(IFLAGS)
# compiler flags: C, F90, F77
# C flags must include DFLAGS and IFLAGS
# F90 flags must include MODFLAGS, IFLAGS, and FDFLAGS with appropriate syntax
CFLAGS = -O3 -D__align__\(n\)=__attribute__\(\(aligned\(n\)\)\) -D__location__\(a\)=__annotate__\(a\) -DCUDARTAPI= -D__x86_64 $(DFLAGS) $(IFLAGS)
F90FLAGS = -O3 -Mcache_align -r8 -Mpreprocess -mp=nonuma $(FDFLAGS) $(IFLAGS) $(MODFLAGS)
FFLAGS = -O3 -r8 -mp=nonuma
# compiler flags without optimization for fortran-77
# the latter is NEEDED to properly compile dlamch.f, used by lapack
FFLAGS_NOOPT = -O0
# compiler flag needed by some compilers when the main is not fortran
# Currently used for Yambo
FFLAGS_NOMAIN = -Mnomain
# Linker, linker-specific flags (if any)
# Typically LD coincides with F90 or MPIF90, LD_LIBS is empty
LD = pgf90
LDFLAGS = -v -mp=nonuma
LD_LIBS = $(MPICH_DIR)/lib/libmpich_pgi.so /opt/cray/libsci/12.0.00/pgi/119/interlagos/lib/libsci_pgi_mp.a $(CRAY_CUDATOOLKIT_POST_LINK_OPTS) -lcublas -lcufft
# External Libraries (if any) : blas, lapack, fft, MPI
# If you have nothing better, use the local copy :
# BLAS_LIBS = /your/path/to/espresso/BLAS/blas.a
# BLAS_LIBS_SWITCH = internal
BLAS_LIBS = /ccs/home/spigafi/espresso/GPU/..//phiGEMM/lib/libphigemm.a
BLAS_LIBS_SWITCH = external
# OpenBLAS is used to exploit multi-core CPU if a multi-threaded BLAS
# is not used or installed in the system (i.e. MKL is missing)
OPENBLAS_INTERNAL = 0
# If you have nothing better, use the local copy :
# LAPACK_LIBS = /your/path/to/espresso/lapack-3.2/lapack.a
# LAPACK_LIBS_SWITCH = internal
# For IBM machines with essl (-D__ESSL): load essl BEFORE lapack !
# remember that LAPACK_LIBS precedes BLAS_LIBS in loading order
# CBLAS is used in case the C interface for BLAS is missing (i.e. ACML)
CBLAS_ENABLED = 0
LAPACK_LIBS =
LAPACK_LIBS_SWITCH = external
ELPA_LIBS_SWITCH = enabled
SCALAPACK_LIBS = $(TOPDIR)/ELPA/libelpa.a
# nothing needed here if the internal copy of FFTW is compiled
# (needs -D__FFTW in DFLAGS)
FFT_LIBS =
# For parallel execution, the correct path to MPI libraries must
# be specified in MPI_LIBS (except for IBM if you use mpxlf)
MPI_LIBS =
# IBM-specific: MASS libraries, if available and if -D__MASS is defined in FDFLAGS
MASS_LIBS =
# ar command and flags - for most architectures: AR = ar, ARFLAGS = ruv
AR = ar
ARFLAGS = ruv
# ranlib command. If ranlib is not needed (it isn't in most cases) use
# RANLIB = echo
RANLIB = ranlib
# all internal and external libraries - do not modify
FLIB_TARGETS = all
# CUDA section
NVCC = /opt/nvidia/cudatoolkit/5.0.35.102/bin/nvcc
NVCCFLAGS = -O3 -gencode arch=compute_30,code=sm_35
PHIGEMM_INTERNAL = 1
PHIGEMM_SYMBOLS = 1
MAGMA_INTERNAL = 0
LIBOBJS = ../flib/ptools.a ../flib/flib.a ../clib/clib.a ../iotk/src/libiotk.a
LIBS = $(SCALAPACK_LIBS) $(LAPACK_LIBS) $(FFT_LIBS) $(BLAS_LIBS) $(MPI_LIBS) $(MASS_LIBS) $(LD_LIBS)
# wget or curl - useful to download from network
WGET = wget -O
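# Usage sketch (an assumption, since this file is normally produced by configure):
# after adapting the site-specific paths above (TOPDIR, libsci, cudatoolkit,
# phiGEMM) to the local installation, it could be reused directly as the
# top-level make.sys, e.g.
#   cd $HOME/espresso
#   cp install/Make.CRAY-XK7 make.sys
#   make -f Makefile.gpu pw-gpu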


@@ -1,7 +1,4 @@
Info by F. Spiga (spiga -dot- filippo -at- gmail -dot- com) -- Jun 19, 2013
Machine name : TODI at CSCS (CH)
Machine spec : http://user.cscs.ch/hardware/todi_cray_xk7/index.html
Info by Filippo Spiga -- Sept 18, 2013
Machine name : TITAN at Oak Ridge National laboratory (USA)
Machine spec : https://www.olcf.ornl.gov/computing-resources/titan-cray-xk7/
@@ -10,7 +7,8 @@ Machine spec : https://www.olcf.ornl.gov/computing-resources/titan-cray-xk7/
Other CRAY XK7 systems might have different modules; please check for
equivalents if the ones mentioned here are missing
0. Architecture peculiarities
CRAY XK7 systems currently in operation are equipped with one AMD 16-core
@@ -34,7 +32,9 @@ support a resource manager to identify a subset of free nodes in the cluster
to minimize hops. Please refer to the specific user guide provided by your HPC
centre.
1. Compile the code
1. Compile GPU-accelerated PWscf
Up to now, extensive tests have shown that the Intel compiler is the best choice
to exploit the GPU capabilities of QE-GPU on CRAY XK7 nodes.
@@ -46,56 +46,96 @@ is used.
Intel:
- compile: ok
- CPU execution: ok
- GPU execution without CRAY_CUDA_PROXY: ok
- GPU execution with CRAY_CUDA_PROXY: ok
- GPU execution: ok
PGI, CRAY:
PGI:
- compile: ok
- CPU execution: ok
- GPU execution without CRAY_CUDA_PROXY: *KO*
- GPU execution with CRAY_CUDA_PROXY: *KO*
- GPU execution: ok
GNU: not tested yet
# NOTE:
Priority will be given to the Intel compiler since future generations of CRAY
systems will be equipped with Intel processors (see CRAY XC30).
After login to a system...
1.2 Titan (ORNL):
module load cudatoolkit
module unload atp totalview-support xt-totalview hss-llm
PGI is the default after login...
export FINALDIR=$HOME/whatever
mkdir -p $FINALDIR
$ module switch PrgEnv-pgi PrgEnv-intel
$ module load cudatoolkit
$ module unload atp totalview-support xt-totalview hss-llm
$ module load gcc/4.4.4
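A quick sanity check before configuring (assumption: the cudatoolkit module
exports CRAY_CUDATOOLKIT_DIR, the variable passed to --with-cuda-dir below):
$ module list                    # PrgEnv-intel, cudatoolkit and gcc should appear
$ echo ${CRAY_CUDATOOLKIT_DIR}   # must point to a valid CUDA toolkit installation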
make -f Makefile.gpu distclean
cd GPU/
./configure --enable-parallel --enable-openmp --enable-cuda --with-gpu-arch=35 --with-cuda-dir=${CRAY_CUDATOOLKIT_DIR} --without-magma --with-phigemm --with-scalapack ARCH=crayxt
cd ../
make -f Makefile.gpu pw-gpu
cp GPU/PW/pw-gpu.x ${FINALDIR}/pw-mpi-omp-gpu-scalapack.x
$ export FINALDIR=$HOME/whatever
$ mkdir -p $FINALDIR
_or_
$ make -f Makefile.gpu distclean
$ cd GPU/
$ ./configure --enable-parallel --enable-openmp --enable-cuda --with-gpu-arch=35 --with-cuda-dir=${CRAY_CUDATOOLKIT_DIR} --disable-magma --enable-phigemm --with-scalapack --disable-profiling ARCH=crayxt
$ cd ../
$ make -f Makefile.gpu pw-gpu
$ cp GPU/PW/pw-gpu.x ${FINALDIR}/pw-mpi-omp-gpu-scalapack.x
make -f Makefile.gpu distclean
cd GPU/
./configure --enable-parallel --enable-openmp --enable-cuda --with-gpu-arch=35 --with-cuda-dir=${CRAY_CUDATOOLKIT_DIR} --without-magma --with-phigemm --with-scalapack --with-elpa ARCH=crayxt
cd ../
make -f Makefile.gpu pw-gpu
cp GPU/PW/pw-gpu.x ${FINALDIR}/pw-mpi-omp-gpu-elpa.x
IMPORTANT NOTE (1): in the case of the PGI compiler, it is possible to compile a
lighter version of the executable by avoiding the CRAY wrappers. Look at the file
"" as an example.
IMPORTANT NOTE (2): On TITAN the PGI compiler is the default right after login. If
Intel is the preferred choice, do
$ module switch PrgEnv-pgi PrgEnv-intel
immediately after login.
3. Running (using TORQUE/PBS Pro)
Refer to this link
https://www.olcf.ornl.gov/kb_articles/spider-the-center-wide-lustre-file-system/
for information about filesystem management and availability
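As an illustration only (the actual scratch path is site-specific; check the
OLCF documentation linked above), a staging step before submitting the job
below might look like:
$ cd $MEMBERWORK/<XXXYYY>        # hypothetical Lustre scratch area of project XXXYYY
$ mkdir -p qe-bench && cd qe-bench
$ cp ${FINALDIR}/pw-mpi-omp-gpu-scalapack.x ./pw-mpi-omp-gpu.x
$ cp /path/to/ausurf_gamma.in .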
IMPORTANT NOTE (1):
The per node charging factor changed from 16 to 30 to reflect the availability
of GPU/Accelerators. Job utilization is now calculated via the formula:
30 * wall-time * number of requested nodes
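As a worked example, the 8-node, 1-hour job in the script below would be charged
30 * 1 hour * 8 nodes = 240 core-hours.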
IMPORTANT NOTE (2):
Projects granted at ORNL usually have three letters (XXX) and three digits (YYY)
#!/bin/csh
#
# Example requesting 8 nodes (64 cores in total in SINGLE STREAM MODE
# using 8 OpenMP threads per MPI process), 1 MPI process per node (8 in total)
# dedicating the full NVIDIA K20x resource to a single MPI process.
#
#PBS -A <XXXYYY>
#PBS -N QE-BENCH
#PBS -j oe
#PBS -l walltime=1:00:00,nodes=8
cd $PBS_O_WORKDIR
# Enable _only_ if '-N' > 1 (see below)
#setenv CRAY_CUDA_PROXY 1
# DEBUG
#setenv MPICH_ENV_DISPLAY 1
# _REMEMBER_
# '-n' : number of PEs or total MPI processes
# '-d' : number of OpenMP threads per MPI process
# '-N' : number of MPI processes per node
# '-j' : number of CPUs to use per compute unit (1 = single-stream mode)
setenv OMP_NUM_THREADS 8
aprun -N 1 -n 8 -j 1 -d 8 -cc numa_node ./pw-mpi-omp-gpu.x -input ausurf_gamma.in | tee out.GPU.1-PER-NODE.$PBS_JOBID.v1
$ make -f Makefile.gpu distclean
$ cd GPU/
$ ./configure --enable-parallel --enable-openmp --enable-cuda --with-gpu-arch=35 --with-cuda-dir=${CRAY_CUDATOOLKIT_DIR} --disable-magma --enable-phigemm --with-scalapack --disable-profiling --with-elpa ARCH=crayxt
$ cd ../
$ make -f Makefile.gpu pw-gpu
$ cp GPU/PW/pw-gpu.x ${FINALDIR}/pw-mpi-omp-gpu-elpa.x
$ make -f Makefile.gpu distclean
$ cd GPU/
$ ./configure --enable-parallel --enable-openmp --enable-cuda --with-gpu-arch=35 --with-cuda-dir=${CRAY_CUDATOOLKIT_DIR} --enable-magma --enable-phigemm --without-scalapack --disable-profiling --with-internal-cblas ARCH=crayxt
$ cd ../
$ make -f Makefile.gpu pw-gpu
$ cp GPU/PW/pw-gpu.x ${FINALDIR}/pw-mpi-omp-gpu-magma.x
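A possible post-build sanity check (optional and site-dependent; meaningful only
if the executable is dynamically linked against the CUDA libraries):
$ ldd ${FINALDIR}/pw-mpi-omp-gpu-magma.x | grep -E "cublas|cufft"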
2. Good practices
- Each NVIDIA Tesla K20 GPU has 6 GB of memory on the card. Better to limit
@@ -108,91 +148,11 @@ $ cp GPU/PW/pw-gpu.x ${FINALDIR}/pw-mpi-omp-gpu-magma.x
- In order to share the GPU between multiple MPI processes within the node, it is
mandatory to export the variable CRAY_CUDA_PROXY ("export CRAY_CUDA_PROXY=1")
- compiling with hugepage support does not produce big benefits, need
- compiling with huge-pages support does not produce big benefits, need
more testing...
3. Example scripts (CSCS, SLURM)
3.1 TODI (SLURM)
#!/bin/bash
# Example requesting 4 nodes (32 cores in total in SINGLE STREAM MODE
# using 4 OpenMP threads per MPI process), 2 MPI processes per node (8 in total)
# sharing the NVIDIA K20x among them.
#SBATCH --job-name="QE-TEST"
#SBATCH --nodes=4
#SBATCH --time=00:25:00
#SBATCH --output=QE-BENCH.%j.o
#SBATCH --error=QE-BENCH.%j.e
#SBATCH --account=<...>
export CRAY_CUDA_PROXY=1
#export MALLOC_MMAP_MAX_=0
#export MALLOC_TRIM_THRESHOLD_=536870912
export MPICH_VERSION_DISPLAY=1
export MPICH_ENV_DISPLAY=1
export MPICH_CPUMASK_DISPLAY=1
# REMEMBER...
# '-n' : number of PEs or total MPI processes
# '-d' : number of OpenMP threads per MPI process
#export OMP_NUM_THREADS=8
#aprun -n 4 -j 1 -d 8 ./pw-mpi-omp-gpu.x -input ausurf_gamma.in | tee out.GPU.1-PER-NODE.$SLURM_JOB_ID.v1
export OMP_NUM_THREADS=4
aprun -n 8 -j 1 -d 4 ./pw-mpi-omp-gpu.x -input ausurf_gamma.in | tee out.GPU.2-PER-NODE.$SLURM_JOB_ID.v1
#export OMP_NUM_THREADS=2
#aprun -n 16 -j 1 -d 2 ./pw-mpi-omp-gpu.x -input ausurf_gamma.in | tee out.GPU.4-PER-NODE.$SLURM_JOB_ID.v1
3.2 TITAN (PBS Pro)
# IMPORTANT NOTE (1):
The per node charging factor changed from 16 to 30 to reflect the availability
of GPU/Accelerators. Job utilization is now calculated via the formula:
30 * walltime * number of requested nodes
# IMPORTANT NOTE (2):
Projects granted at ORNL usually have three letters (XXX) and three digits (YYY)
#!/bin/csh
# Example requesting 8 nodes (64 cores in total in SINGLE STREAM MODE
# using 8 OpenMP threads per MPI process), 1 MPI process per node (8 in total)
# dedicating the full NVIDIA K20x resource to a single MPI process.
#PBS -A <XXXYYY>
#PBS -N QE-BENCH
#PBS -j oe
#PBS -l walltime=1:00:00,nodes=8
cd $PBS_O_WORKDIR
setenv CRAY_CUDA_PROXY 1
setenv MPICH_ENV_DISPLAY 1
# REMEMBER...
# '-n' : number of PEs or total MPI processes
# '-d' : number of OpenMP threads per MPI process
setenv OMP_NUM_THREADS 8
aprun -n 8 -j 1 -d 8 ./pw-mpi-omp-gpu.x -input ausurf_gamma.in | tee out.GPU.1-PER-NODE.$PBS_JOBID.v1
# IMPORTANT NOTE:
Refer to this link
https://www.olcf.ornl.gov/kb_articles/spider-the-center-wide-lustre-file-system/
for information about filesystem management and availability
4. Benchmarks
5. Benchmarks
[TO BE ADDED]