From c374d873d62767916d5c93df0342eaa1153f6d40 Mon Sep 17 00:00:00 2001 From: spigafi Date: Wed, 18 Sep 2013 21:46:47 +0000 Subject: [PATCH] Compile and run QE-GPU on CRAY XK7 using PGI compilers -- updates git-svn-id: http://qeforge.qe-forge.org/svn/q-e/trunk/espresso@10459 c92efa57-630b-4861-b058-cf58834340f0 --- install/Make.CRAY-XK7 | 172 +++++++++++++++++++++++++++++++++ install/README.CRAY-XK7 | 208 ++++++++++++++++------------------------ 2 files changed, 256 insertions(+), 124 deletions(-) create mode 100644 install/Make.CRAY-XK7 diff --git a/install/Make.CRAY-XK7 b/install/Make.CRAY-XK7 new file mode 100644 index 000000000..e851ebd5e --- /dev/null +++ b/install/Make.CRAY-XK7 @@ -0,0 +1,172 @@ +# make.sys. Generated from make.sys.in by configure. + +# compilation rules + +.SUFFIXES : +.SUFFIXES : .o .c .f .f90 .cu + +# most fortran compilers can directly preprocess c-like directives: use +# $(MPIF90) $(F90FLAGS) -c $< +# if explicit preprocessing by the C preprocessor is needed, use: +# $(CPP) $(CPPFLAGS) $< -o $*.F90 +# $(MPIF90) $(F90FLAGS) -c $*.F90 -o $*.o +# remember the tabulator in the first column !!! + +.f90.o: + $(MPIF90) $(F90FLAGS) -c $< + +# .f.o and .c.o: do not modify + +.f.o: + $(F77) $(FFLAGS) -c $< + +.c.o: + $(CC) $(CFLAGS) -c $< + +# CUDA files +.cu.o: + $(NVCC) $(NVCCFLAGS) -I../../include $(IFLAGS) $(DFLAGS) -c $< + +# topdir for linking espresso libs with plugins +TOPDIR = /ccs/home/spigafi/espresso/GPU/../ + + +# DFLAGS = precompilation options (possible arguments to -D and -U) +# used by the C compiler and preprocessor +# FDFLAGS = as DFLAGS, for the f90 compiler +# See include/defs.h.README for a list of options and their meaning +# With the exception of IBM xlf, FDFLAGS = $(DFLAGS) +# For IBM xlf, FDFLAGS is the same as DFLAGS with separating commas + +# MANUAL_DFLAGS = additional precompilation option(s), if desired +# You may use this instead of tweaking DFLAGS and FDFLAGS +# BEWARE: will not work for IBM xlf! Manually edit FDFLAGS +MANUAL_DFLAGS = +DFLAGS = -D__PGI -D__IOTK_SAFEST -D__FFTW -D__MPI -D__PARA -D__SCALAPACK -D__CUDA -D__OPENMP -D__ELPA -D__PHIGEMM $(MANUAL_DFLAGS) +FDFLAGS = $(DFLAGS) $(MANUAL_DFLAGS) + +# IFLAGS = how to locate directories where files to be included are +# In most cases, IFLAGS = -I../include + +IFLAGS = -I$(MPICH_DIR)/include -I../include -I/ccs/home/spigafi/espresso/GPU/..//phiGEMM/include $(CRAY_CUDATOOLKIT_INCLUDE_OPTS) + +# MOD_FLAGS = flag used by f90 compiler to locate modules +# Each Makefile defines the list of needed modules in MODFLAGS + +MOD_FLAG = -I + +# Compilers: fortran-90, fortran-77, C +# If a parallel compilation is desired, MPIF90 should be a fortran-90 +# compiler that produces executables for parallel execution using MPI +# (such as for instance mpif90, mpf90, mpxlf90,...); +# otherwise, an ordinary fortran-90 compiler (f90, g95, xlf90, ifort,...) 
+# If you have a parallel machine but no suitable candidate for MPIF90, +# try to specify the directory containing "mpif.h" in IFLAGS +# and to specify the location of MPI libraries in MPI_LIBS + +MPIF90 = pgf90 +#F90 = pgf90 +CC = pgcc +F77 = pgf77 + +# C preprocessor and preprocessing flags - for explicit preprocessing, +# if needed (see the compilation rules above) +# preprocessing flags must include DFLAGS and IFLAGS + +CPP = cpp +CPPFLAGS = -P -traditional $(DFLAGS) $(IFLAGS) + +# compiler flags: C, F90, F77 +# C flags must include DFLAGS and IFLAGS +# F90 flags must include MODFLAGS, IFLAGS, and FDFLAGS with appropriate syntax + +CFLAGS = -O3 -D__align__\(n\)=__attribute__\(\(aligned\(n\)\)\) -D__location__\(a\)=__annotate__\(a\) -DCUDARTAPI= -D__x86_64 $(DFLAGS) $(IFLAGS) +F90FLAGS = -O3 -Mcache_align -r8 -Mpreprocess -mp=nonuma $(FDFLAGS) $(IFLAGS) $(MODFLAGS) +FFLAGS = -O3 -r8 -mp=nonuma + +# compiler flags without optimization for fortran-77 +# the latter is NEEDED to properly compile dlamch.f, used by lapack + +FFLAGS_NOOPT = -O0 + +# compiler flag needed by some compilers when the main is not fortran +# Currently used for Yambo + +FFLAGS_NOMAIN = -Mnomain + +# Linker, linker-specific flags (if any) +# Typically LD coincides with F90 or MPIF90, LD_LIBS is empty + +LD = pgf90 +LDFLAGS = -v -mp=nonuma +LD_LIBS = $(MPICH_DIR)/lib/libmpich_pgi.so /opt/cray/libsci/12.0.00/pgi/119/interlagos/lib/libsci_pgi_mp.a $(CRAY_CUDATOOLKIT_POST_LINK_OPTS) -lcublas -lcufft + +# External Libraries (if any) : blas, lapack, fft, MPI + +# If you have nothing better, use the local copy : +# BLAS_LIBS = /your/path/to/espresso/BLAS/blas.a +# BLAS_LIBS_SWITCH = internal + +BLAS_LIBS = /ccs/home/spigafi/espresso/GPU/..//phiGEMM/lib/libphigemm.a +BLAS_LIBS_SWITCH = external + +# OpenBLAS is used to exploit multi-core CPU if a multi-threaded BLAS +# is not used or installed in the system (i.e. MKL is missing) +OPENBLAS_INTERNAL = 0 + +# If you have nothing better, use the local copy : +# LAPACK_LIBS = /your/path/to/espresso/lapack-3.2/lapack.a +# LAPACK_LIBS_SWITCH = internal +# For IBM machines with essl (-D__ESSL): load essl BEFORE lapack ! +# remember that LAPACK_LIBS precedes BLAS_LIBS in loading order + +# CBLAS is used in case the C interface for BLAS is missing (i.e. ACML) +CBLAS_ENABLED = 0 + +LAPACK_LIBS = +LAPACK_LIBS_SWITCH = external + +ELPA_LIBS_SWITCH = enabled +SCALAPACK_LIBS = $(TOPDIR)/ELPA/libelpa.a + +# nothing needed here if the the internal copy of FFTW is compiled +# (needs -D__FFTW in DFLAGS) + +FFT_LIBS = + +# For parallel execution, the correct path to MPI libraries must +# be specified in MPI_LIBS (except for IBM if you use mpxlf) + +MPI_LIBS = + +# IBM-specific: MASS libraries, if available and if -D__MASS is defined in FDFLAGS + +MASS_LIBS = + +# ar command and flags - for most architectures: AR = ar, ARFLAGS = ruv + +AR = ar +ARFLAGS = ruv + +# ranlib command. 
If ranlib is not needed (it isn't in most cases) use
+# RANLIB = echo
+
+RANLIB = ranlib
+
+# all internal and external libraries - do not modify
+
+FLIB_TARGETS = all
+
+# CUDA section
+NVCC = /opt/nvidia/cudatoolkit/5.0.35.102/bin/nvcc
+NVCCFLAGS = -O3 -gencode arch=compute_30,code=sm_35
+
+PHIGEMM_INTERNAL = 1
+PHIGEMM_SYMBOLS = 1
+MAGMA_INTERNAL = 0
+
+LIBOBJS = ../flib/ptools.a ../flib/flib.a ../clib/clib.a ../iotk/src/libiotk.a
+LIBS = $(SCALAPACK_LIBS) $(LAPACK_LIBS) $(FFT_LIBS) $(BLAS_LIBS) $(MPI_LIBS) $(MASS_LIBS) $(LD_LIBS)
+
+# wget or curl - useful to download from network
+WGET = wget -O
diff --git a/install/README.CRAY-XK7 b/install/README.CRAY-XK7
index 9309b2156..fed2268e2 100644
--- a/install/README.CRAY-XK7
+++ b/install/README.CRAY-XK7
@@ -1,7 +1,4 @@
-Info by F. Spiga (spiga -dot- filippo -at- gmail -dot- com) -- Jun 19, 2013
-
-Machine name : TODI at CSCS (CH)
-Machine spec : http://user.cscs.ch/hardware/todi_cray_xk7/index.html
+Info by Filippo Spiga -- Sept 18, 2013
 
 Machine name : TITAN at Oak Ridge National Laboratory (USA)
 Machine spec : https://www.olcf.ornl.gov/computing-resources/titan-cray-xk7/
@@ -10,7 +7,8 @@ Machine spec : https://www.olcf.ornl.gov/computing-resources/titan-cray-xk7/
 Other CRAY XK7 systems might have different modules; please check for
 equivalents if the ones mentioned are missing
 
-
+
+
 0. Architecture peculiarities
 
 CRAY XK7 systems currently in operation are equipped with one AMD 16-core
@@ -34,7 +32,9 @@ support a resource manager to identify a subset of free nodes in the cluster
 to minimize hops. Please refer to the specific user guide provided by your HPC
 centre.
 
-1. Compile the code
+
+
+1. Compile GPU-accelerated PWscf
 
 Up to now, extensive tests have shown that the Intel compiler is the best choice
 to exploit the GPU capabilities of QE-GPU on CRAY XK7 nodes.
@@ -46,56 +46,96 @@ is used.
 Intel:
 - compile: ok
 - CPU execution: ok
-- GPU execution without CRAY_CUDA_PROXY: ok
-- GPU execution with CRAY_CUDA_PROXY: ok
+- GPU execution: ok
 
-PGI, CRAY:
+PGI:
 - compile: ok
 - CPU execution: ok
-- GPU execution without CRAY_CUDA_PROXY: *KO*
-- GPU execution with CRAY_CUDA_PROXY: *KO*
+- GPU execution: ok
 
 GNU: not tested yet
 
-
-# NOTE:
-Priority will be given to Intel compiler since future generations of CRAY
-will be equipped with Intel processors (see CRAY XC30).
+After logging in to the system, run:
 
-1.2 Titan (ORNL) :
+module load cudatoolkit
+module unload atp totalview-support xt-totalview hss-llm
 
-PGI is the default after login... 
+export FINALDIR=$HOME/whatever
+mkdir -p $FINALDIR
 
-$ module switch PrgEnv-pgi PrgEnv-intel
-$ module load cudatoolkit
-$ module unload atp totalview-support xt-totalview hss-llm
-$ module load gcc/4.4.4
+make -f Makefile.gpu distclean
+cd GPU/
+./configure --enable-parallel --enable-openmp --enable-cuda --with-gpu-arch=35 --with-cuda-dir=${CRAY_CUDATOOLKIT_DIR} --without-magma --with-phigemm --with-scalapack ARCH=crayxt
+cd ../
+make -f Makefile.gpu pw-gpu
+cp GPU/PW/pw-gpu.x ${FINALDIR}/pw-mpi-omp-gpu-scalapack.x
 
-$ export FINALDIR=$HOME/whatever
-$ mkdir -p $FINALDIR
+_or_
 
-$ make -f Makefile.gpu distclean
-$ cd GPU/
-$ ./configure --enable-parallel --enable-openmp --enable-cuda --with-gpu-arch=35 --with-cuda-dir=${CRAY_CUDATOOLKIT_DIR} --disable-magma --enable-phigemm --with-scalapack --disable-profiling ARCH=crayxt
-$ cd ../
-$ make -f Makefile.gpu pw-gpu
-$ cp GPU/PW/pw-gpu.x ${FINALDIR}/pw-mpi-omp-gpu-scalapack.x
+make -f Makefile.gpu distclean
+cd GPU/
+./configure --enable-parallel --enable-openmp --enable-cuda --with-gpu-arch=35 --with-cuda-dir=${CRAY_CUDATOOLKIT_DIR} --without-magma --with-phigemm --with-scalapack --with-elpa ARCH=crayxt
+cd ../
+make -f Makefile.gpu pw-gpu
+cp GPU/PW/pw-gpu.x ${FINALDIR}/pw-mpi-omp-gpu-elpa.x
+
+
+IMPORTANT NOTE (1): in the case of the PGI compiler, it is possible to build a
+                    lighter version of the executable that avoids the CRAY compiler
+                    wrappers. Look at the file "Make.CRAY-XK7" as an example.
+
+IMPORTANT NOTE (2): on TITAN the PGI compiler is the default right after login. If
+                    Intel is the preferred choice, run
+                    $ module switch PrgEnv-pgi PrgEnv-intel
+                    immediately after login.
+
+
+
+3. Running (using TORQUE/PBS Pro)
+
+Refer to this link for information about filesystem management and availability:
+    https://www.olcf.ornl.gov/kb_articles/spider-the-center-wide-lustre-file-system/
+
+IMPORTANT NOTE (1):
+The per-node charging factor changed from 16 to 30 to reflect the availability
+of GPU/Accelerators. Job utilization is now calculated via the formula:
+    30 * wall-time * number of requested nodes
+
+IMPORTANT NOTE (2):
+Projects granted at ORNL usually have identifiers made of three letters (XXX)
+and three digits (YYY)
+
+
+#!/bin/csh
+#
+# Example requesting 8 nodes (64 cores in total in SINGLE STREAM MODE
+# using 8 OpenMP threads per MPI process), 1 MPI process per node (8 in total),
+# dedicating the full NVIDIA K20x resource to a single MPI process.
+#
+#PBS -A 
+#PBS -N QE-BENCH
+#PBS -j oe
+#PBS -l walltime=1:00:00,nodes=8
+
+cd $PBS_O_WORKDIR
+
+# Enable _only_ if '-N' > 1 (see below; a 2-processes-per-node launch is
+# sketched at the end of this document)
+#setenv CRAY_CUDA_PROXY 1
+
+# DEBUG
+#setenv MPICH_ENV_DISPLAY 1
+
+# _REMEMBER_
+# '-n' : number of PEs or total MPI processes
+# '-d' : number of OpenMP threads per MPI process
+# '-N' : number of MPI processes per node
+# '-j' : number of CPUs per compute unit ('-j 1' = single stream mode)
+
+setenv OMP_NUM_THREADS 8
+aprun -N 1 -n 8 -j 1 -d 8 -cc numa_node ./pw-mpi-omp-gpu.x -input ausurf_gamma.in | tee out.GPU.1-PER-NODE.$PBS_JOBID.v1
 
-$ make -f Makefile.gpu distclean
-$ cd GPU/
-$ ./configure --enable-parallel --enable-openmp --enable-cuda --with-gpu-arch=35 --with-cuda-dir=${CRAY_CUDATOOLKIT_DIR} --disable-magma --enable-phigemm --with-scalapack --disable-profiling --with-elpa ARCH=crayxt
-$ cd ../
-$ make -f Makefile.gpu pw-gpu
-$ cp GPU/PW/pw-gpu.x ${FINALDIR}/pw-mpi-omp-gpu-elpa.x
 
-$ make -f Makefile.gpu distclean
-$ cd GPU/
-$ ./configure --enable-parallel --enable-openmp --enable-cuda --with-gpu-arch=35 --with-cuda-dir=${CRAY_CUDATOOLKIT_DIR} --enable-magma --enable-phigemm --without-scalapack --disable-profiling --with-internal-cblas ARCH=crayxt
-$ cd ../
-$ make -f Makefile.gpu pw-gpu
-$ cp GPU/PW/pw-gpu.x ${FINALDIR}/pw-mpi-omp-gpu-magma.x
-
 2. Good practices
 
 - Each NVIDIA Tesla K20 GPU has 6 GB of memory on the card. Better to limit
@@ -108,91 +148,11 @@ $ cp GPU/PW/pw-gpu.x ${FINALDIR}/pw-mpi-omp-gpu-magma.x
 - In order to share the GPU between multiple MPI processes within the node, it is
 mandatory to export the variable CRAY_CUDA_PROXY ("export CRAY_CUDA_PROXY=1")
 
-- compiling with hugepage support does not produce big benefits, need
+- compiling with huge-pages support does not produce significant benefits; it needs
 more testing...
 
 
-3 Example scripts (CSCS, SLURM)
-3.1 TODI (SLURM)
-
-#!/bin/bash
-
-# Example requesting 4 nodes (32 cores in total in SINGLE STREAM MODE
-# using 4 OpenMP thread per MPI), 2 MPI process per node (8 in total)
-# sharing the NVIDIA K20x among them.
-
-#SBATCH --job-name="QE-TEST"
-#SBATCH --nodes=4
-#SBATCH --time=00:25:00
-#SBATCH --output=QE-BENCH.%j.o
-#SBATCH --error=QE-BENCH.%j.e
-#SBATCH --account=<...>
-
-export CRAY_CUDA_PROXY=1
-
-#export MALLOC_MMAP_MAX_=0
-#export MALLOC_TRIM_THRESHOLD_=536870912
-
-export MPICH_VERSION_DISPLAY=1
-export MPICH_ENV_DISPLAY=1
-export MPICH_CPUMASK_DISPLAY=1
-
-# REMEMBER...
-# '-n' : number of PEs or total MPI processes
-# '-d' : number OpenMP thread per node
-
-#export OMP_NUM_THREADS=8
-#aprun -n 4 -j 1 -d 8 ./pw-mpi-omp-gpu.x -input ausurf_gamma.in | tee out.GPU.1-PER-NODE.$SLURM_JOB_ID.v1
-
-export OMP_NUM_THREADS=4
-aprun -n 8 -j 1 -d 4 ./pw-mpi-omp-gpu.x -input ausurf_gamma.in | tee out.GPU.2-PER-NODE.$SLURM_JOB_ID.v1
-
-#export OMP_NUM_THREADS=2
-#aprun -n 16 -j 1 -d 2 ./pw-mpi-omp-gpu.x -input ausurf_gamma.in | tee out.GPU.4-PER-NODE.$SLURM_JOB_ID.v1
-
-
-3.2 TITAN (PBS Pro)
-
-# IMPORTANT NOTE (1):
-The per node charging factor changed from 16 to 30 to reflect the availability
-of GPU/Accelerators. Job utilization is now calculated via the formula:
- 30 * walltime * number of requested nodes
-
-# IMPORTANT NOTE (2):
-Project granted at ORNL usually have 3 letters (XXX) and three digits (YYY)
-
-#!/bin/csh
-
-# Example requesting 8 nodes (64 cores in total in SINGLE STREAM MODE
-# using 8 OpenMP thread per MPI), 1 MPI process per node (8 in total)
-# dedicating the full NVIDIA K20x resource to a single MPI process. 
- -#PBS -A -#PBS -N QE-BENCH -#PBS -j oe -#PBS -l walltime=1:00:00,nodes=8 - -cd $PBS_O_WORKDIR - - -setenv CRAY_CUDA_PROXY 1 -setenv MPICH_ENV_DISPLAY 1 - -# REMEMBER... -# '-n' : number of PEs or total MPI processes -# '-d' : number OpenMP thread per node - -setenv OMP_NUM_THREADS 8 -aprun -n 8 -j 1 -d 8 ./pw-mpi-omp-gpu.x -input ausurf_gamma.in | tee out.GPU.1-PER-NODE.$PBS_JOBID.v1 - - -# IMPORTANT NOTE: -refer to this link - https://www.olcf.ornl.gov/kb_articles/spider-the-center-wide-lustre-file-system/ -about the fileystem management and availability - - -4. Benchmarks +5. Benchmarks [TO BE ADDED] \ No newline at end of file
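
A possible companion example (a sketch only, not a tested recipe): the "Good
practices" section above notes that CRAY_CUDA_PROXY must be exported whenever
more than one MPI process per node shares the NVIDIA K20x. The script below
adapts the 1-process-per-node PBS example to 2 MPI processes per node with 4
OpenMP threads each on the same 8 nodes; the project ID placeholder (XXXYYY),
the output file name and the 2x4 process/thread split are illustrative
assumptions, not recommended settings.

#!/bin/csh
#
# Sketch: 8 nodes, 2 MPI processes per node (16 in total), 4 OpenMP threads per
# MPI process, the two processes on each node sharing one NVIDIA K20x.
#
#PBS -A XXXYYY
#PBS -N QE-BENCH
#PBS -j oe
#PBS -l walltime=1:00:00,nodes=8

cd $PBS_O_WORKDIR

# Mandatory here: more than one MPI process per node shares the GPU ('-N' > 1)
setenv CRAY_CUDA_PROXY 1

# 8 nodes x 2 MPI processes each = 16 ('-n'), 4 OpenMP threads per process ('-d')
setenv OMP_NUM_THREADS 4
aprun -N 2 -n 16 -j 1 -d 4 -cc numa_node ./pw-mpi-omp-gpu.x -input ausurf_gamma.in | tee out.GPU.2-PER-NODE.$PBS_JOBID.v1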