Compile and run QE-GPU on CRAY XK7 using PGI compilers -- updates

git-svn-id: http://qeforge.qe-forge.org/svn/q-e/trunk/espresso@10459 c92efa57-630b-4861-b058-cf58834340f0
spigafi 2013-09-18 21:46:47 +00:00
parent 6d6e8ccaff
commit c374d873d6
2 changed files with 256 additions and 124 deletions

install/Make.CRAY-XK7 (new file)

@@ -0,0 +1,172 @@
# make.sys. Generated from make.sys.in by configure.
# compilation rules
.SUFFIXES :
.SUFFIXES : .o .c .f .f90 .cu
# most fortran compilers can directly preprocess c-like directives: use
# $(MPIF90) $(F90FLAGS) -c $<
# if explicit preprocessing by the C preprocessor is needed, use:
# $(CPP) $(CPPFLAGS) $< -o $*.F90
# $(MPIF90) $(F90FLAGS) -c $*.F90 -o $*.o
# remember the tabulator in the first column !!!
.f90.o:
$(MPIF90) $(F90FLAGS) -c $<
# .f.o and .c.o: do not modify
.f.o:
$(F77) $(FFLAGS) -c $<
.c.o:
$(CC) $(CFLAGS) -c $<
# CUDA files
.cu.o:
$(NVCC) $(NVCCFLAGS) -I../../include $(IFLAGS) $(DFLAGS) -c $<
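# For illustration only: with the NVCC and NVCCFLAGS values defined further
# below, the .cu.o rule above compiles a hypothetical file mykernel.cu roughly as
#   nvcc -O3 -gencode arch=compute_30,code=sm_35 -I../../include $(IFLAGS) $(DFLAGS) -c mykernel.cu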
# topdir for linking espresso libs with plugins
TOPDIR = /ccs/home/spigafi/espresso/GPU/../
# DFLAGS = precompilation options (possible arguments to -D and -U)
# used by the C compiler and preprocessor
# FDFLAGS = as DFLAGS, for the f90 compiler
# See include/defs.h.README for a list of options and their meaning
# With the exception of IBM xlf, FDFLAGS = $(DFLAGS)
# For IBM xlf, FDFLAGS is the same as DFLAGS with separating commas
# MANUAL_DFLAGS = additional precompilation option(s), if desired
# You may use this instead of tweaking DFLAGS and FDFLAGS
# BEWARE: will not work for IBM xlf! Manually edit FDFLAGS
MANUAL_DFLAGS =
DFLAGS = -D__PGI -D__IOTK_SAFEST -D__FFTW -D__MPI -D__PARA -D__SCALAPACK -D__CUDA -D__OPENMP -D__ELPA -D__PHIGEMM $(MANUAL_DFLAGS)
FDFLAGS = $(DFLAGS) $(MANUAL_DFLAGS)
# IFLAGS = how to locate directories where files to be included are
# In most cases, IFLAGS = -I../include
IFLAGS = -I$(MPICH_DIR)/include -I../include -I/ccs/home/spigafi/espresso/GPU/..//phiGEMM/include $(CRAY_CUDATOOLKIT_INCLUDE_OPTS)
# MOD_FLAGS = flag used by f90 compiler to locate modules
# Each Makefile defines the list of needed modules in MODFLAGS
MOD_FLAG = -I
# Compilers: fortran-90, fortran-77, C
# If a parallel compilation is desired, MPIF90 should be a fortran-90
# compiler that produces executables for parallel execution using MPI
# (such as for instance mpif90, mpf90, mpxlf90,...);
# otherwise, an ordinary fortran-90 compiler (f90, g95, xlf90, ifort,...)
# If you have a parallel machine but no suitable candidate for MPIF90,
# try to specify the directory containing "mpif.h" in IFLAGS
# and to specify the location of MPI libraries in MPI_LIBS
MPIF90 = pgf90
#F90 = pgf90
CC = pgcc
F77 = pgf77
# C preprocessor and preprocessing flags - for explicit preprocessing,
# if needed (see the compilation rules above)
# preprocessing flags must include DFLAGS and IFLAGS
CPP = cpp
CPPFLAGS = -P -traditional $(DFLAGS) $(IFLAGS)
# compiler flags: C, F90, F77
# C flags must include DFLAGS and IFLAGS
# F90 flags must include MODFLAGS, IFLAGS, and FDFLAGS with appropriate syntax
CFLAGS = -O3 -D__align__\(n\)=__attribute__\(\(aligned\(n\)\)\) -D__location__\(a\)=__annotate__\(a\) -DCUDARTAPI= -D__x86_64 $(DFLAGS) $(IFLAGS)
F90FLAGS = -O3 -Mcache_align -r8 -Mpreprocess -mp=nonuma $(FDFLAGS) $(IFLAGS) $(MODFLAGS)
FFLAGS = -O3 -r8 -mp=nonuma
# compiler flags without optimization for fortran-77
# the latter is NEEDED to properly compile dlamch.f, used by lapack
FFLAGS_NOOPT = -O0
# compiler flag needed by some compilers when the main is not fortran
# Currently used for Yambo
FFLAGS_NOMAIN = -Mnomain
# Linker, linker-specific flags (if any)
# Typically LD coincides with F90 or MPIF90, LD_LIBS is empty
LD = pgf90
LDFLAGS = -v -mp=nonuma
LD_LIBS = $(MPICH_DIR)/lib/libmpich_pgi.so /opt/cray/libsci/12.0.00/pgi/119/interlagos/lib/libsci_pgi_mp.a $(CRAY_CUDATOOLKIT_POST_LINK_OPTS) -lcublas -lcufft
# External Libraries (if any) : blas, lapack, fft, MPI
# If you have nothing better, use the local copy :
# BLAS_LIBS = /your/path/to/espresso/BLAS/blas.a
# BLAS_LIBS_SWITCH = internal
BLAS_LIBS = /ccs/home/spigafi/espresso/GPU/..//phiGEMM/lib/libphigemm.a
BLAS_LIBS_SWITCH = external
# OpenBLAS is used to exploit multi-core CPU if a multi-threaded BLAS
# is not used or installed in the system (i.e. MKL is missing)
OPENBLAS_INTERNAL = 0
# If you have nothing better, use the local copy :
# LAPACK_LIBS = /your/path/to/espresso/lapack-3.2/lapack.a
# LAPACK_LIBS_SWITCH = internal
# For IBM machines with essl (-D__ESSL): load essl BEFORE lapack !
# remember that LAPACK_LIBS precedes BLAS_LIBS in loading order
# CBLAS is used in case the C interface for BLAS is missing (i.e. ACML)
CBLAS_ENABLED = 0
LAPACK_LIBS =
LAPACK_LIBS_SWITCH = external
ELPA_LIBS_SWITCH = enabled
SCALAPACK_LIBS = $(TOPDIR)/ELPA/libelpa.a
# nothing needed here if the internal copy of FFTW is compiled
# (needs -D__FFTW in DFLAGS)
FFT_LIBS =
# For parallel execution, the correct path to MPI libraries must
# be specified in MPI_LIBS (except for IBM if you use mpxlf)
MPI_LIBS =
# IBM-specific: MASS libraries, if available and if -D__MASS is defined in FDFLAGS
MASS_LIBS =
# ar command and flags - for most architectures: AR = ar, ARFLAGS = ruv
AR = ar
ARFLAGS = ruv
# ranlib command. If ranlib is not needed (it isn't in most cases) use
# RANLIB = echo
RANLIB = ranlib
# all internal and external libraries - do not modify
FLIB_TARGETS = all
# CUDA section
NVCC = /opt/nvidia/cudatoolkit/5.0.35.102/bin/nvcc
NVCCFLAGS = -O3 -gencode arch=compute_30,code=sm_35
PHIGEMM_INTERNAL = 1
PHIGEMM_SYMBOLS = 1
MAGMA_INTERNAL = 0
LIBOBJS = ../flib/ptools.a ../flib/flib.a ../clib/clib.a ../iotk/src/libiotk.a
LIBS = $(SCALAPACK_LIBS) $(LAPACK_LIBS) $(FFT_LIBS) $(BLAS_LIBS) $(MPI_LIBS) $(MASS_LIBS) $(LD_LIBS)
# wget or curl - useful to download from network
WGET = wget -O
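# Usage sketch (an assumption, since this file is normally produced by configure):
# after adapting the site-specific paths above (TOPDIR, libsci, cudatoolkit,
# phiGEMM) to the local installation, it could be reused directly as the
# top-level make.sys, e.g.
#   cd $HOME/espresso
#   cp install/Make.CRAY-XK7 make.sys
#   make -f Makefile.gpu pw-gpu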


@@ -1,7 +1,4 @@
Info by F. Spiga (spiga -dot- filippo -at- gmail -dot- com) -- Jun 19, 2013
Machine name : TODI at CSCS (CH)
Machine spec : http://user.cscs.ch/hardware/todi_cray_xk7/index.html
Info by Filippo Spiga -- Sept 18, 2013
Machine name : TITAN at Oak Ridge National laboratory (USA)
Machine spec : https://www.olcf.ornl.gov/computing-resources/titan-cray-xk7/
@@ -10,7 +7,8 @@ Machine spec : https://www.olcf.ornl.gov/computing-resources/titan-cray-xk7/
Other CRAY XK7 systems might have different modules; please check for
equivalents if the ones mentioned here are missing
0. Architecture peculiarities
CRAY XK7 systems currently in operation are equipped with one AMD 16-core
@@ -34,7 +32,9 @@ support a resource manager to identify a subset of free nodes in the cluster
to minimize hops. Please refer to the specific user guide provided by your HPC
centre.
1. Compile the code
1. Compile GPU-accelerated PWscf
Up to now, extensive tests have shown that the Intel compiler is the best choice
to exploit the GPU capabilities of QE-GPU on CRAY XK7 nodes.
@@ -46,56 +46,96 @@ is used.
Intel:
- compile: ok
- CPU execution: ok
- GPU execution without CRAY_CUDA_PROXY: ok
- GPU execution with CRAY_CUDA_PROXY: ok
- GPU execution: ok
PGI, CRAY:
PGI:
- compile: ok
- CPU execution: ok
- GPU execution without CRAY_CUDA_PROXY: *KO*
- GPU execution with CRAY_CUDA_PROXY: *KO*
- GPU execution: ok
GNU: not tested yet
# NOTE:
Priority will be given to the Intel compiler since future generations of CRAY
systems will be equipped with Intel processors (see CRAY XC30).
After login to a system...
1.2 Titan (ORNL):
module load cudatoolkit
module unload atp totalview-support xt-totalview hss-llm
PGI is the default after login...
export FINALDIR=$HOME/whatever
mkdir -p $FINALDIR
$ module switch PrgEnv-pgi PrgEnv-intel
$ module load cudatoolkit
$ module unload atp totalview-support xt-totalview hss-llm
$ module load gcc/4.4.4
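A quick sanity check before configuring (assumption: the cudatoolkit module
exports CRAY_CUDATOOLKIT_DIR, the variable passed to --with-cuda-dir below):
$ module list                    # PrgEnv-intel, cudatoolkit and gcc should appear
$ echo ${CRAY_CUDATOOLKIT_DIR}   # must point to a valid CUDA toolkit installation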
make -f Makefile.gpu distclean
cd GPU/
./configure --enable-parallel --enable-openmp --enable-cuda --with-gpu-arch=35 --with-cuda-dir=${CRAY_CUDATOOLKIT_DIR} --without-magma --with-phigemm --with-scalapack ARCH=crayxt
cd ../
make -f Makefile.gpu pw-gpu
cp GPU/PW/pw-gpu.x ${FINALDIR}/pw-mpi-omp-gpu-scalapack.x
$ export FINALDIR=$HOME/whatever
$ mkdir -p $FINALDIR
_or_
$ make -f Makefile.gpu distclean
$ cd GPU/
$ ./configure --enable-parallel --enable-openmp --enable-cuda --with-gpu-arch=35 --with-cuda-dir=${CRAY_CUDATOOLKIT_DIR} --disable-magma --enable-phigemm --with-scalapack --disable-profiling ARCH=crayxt
$ cd ../
$ make -f Makefile.gpu pw-gpu
$ cp GPU/PW/pw-gpu.x ${FINALDIR}/pw-mpi-omp-gpu-scalapack.x
make -f Makefile.gpu distclean
cd GPU/
./configure --enable-parallel --enable-openmp --enable-cuda --with-gpu-arch=35 --with-cuda-dir=${CRAY_CUDATOOLKIT_DIR} --without-magma --with-phigemm --with-scalapack --with-elpa ARCH=crayxt
cd ../
make -f Makefile.gpu pw-gpu
cp GPU/PW/pw-gpu.x ${FINALDIR}/pw-mpi-omp-gpu-elpa.x
IMPORTANT NOTE (1): in the case of the PGI compiler, it is possible to compile a
lighter version of the executable by avoiding the CRAY wrappers. Look at the file
"" as an example.
IMPORTANT NOTE (2): On TITAN the PGI compiler is the default right after login. If
Intel is the preferred choice, do
$ module switch PrgEnv-pgi PrgEnv-intel
immediately after login.
3. Running (using TORQUE/PBS Pro)
Refer to this link
https://www.olcf.ornl.gov/kb_articles/spider-the-center-wide-lustre-file-system/
for information about filesystem management and availability
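As an illustration only (the actual scratch path is site-specific; check the
OLCF documentation linked above), a staging step before submitting the job
below might look like:
$ cd $MEMBERWORK/<XXXYYY>        # hypothetical Lustre scratch area of project XXXYYY
$ mkdir -p qe-bench && cd qe-bench
$ cp ${FINALDIR}/pw-mpi-omp-gpu-scalapack.x ./pw-mpi-omp-gpu.x
$ cp /path/to/ausurf_gamma.in .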
IMPORTANT NOTE (1):
The per node charging factor changed from 16 to 30 to reflect the availability
of GPU/Accelerators. Job utilization is now calculated via the formula:
30 * wall-time * number of requested nodes
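As a worked example, the 8-node, 1-hour job in the script below would be charged
30 * 1 hour * 8 nodes = 240 core-hours.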
IMPORTANT NOTE (2):
Projects granted at ORNL usually have three letters (XXX) and three digits (YYY)
#!/bin/csh
#
# Example requesting 8 nodes (64 cores in total in SINGLE STREAM MODE
# using 8 OpenMP threads per MPI process), 1 MPI process per node (8 in total)
# dedicating the full NVIDIA K20x resource to a single MPI process.
#
#PBS -A <XXXYYY>
#PBS -N QE-BENCH
#PBS -j oe
#PBS -l walltime=1:00:00,nodes=8
cd $PBS_O_WORKDIR
# Enable _only_ if '-N' > 1 (see below)
#setenv CRAY_CUDA_PROXY 1
# DEBUG
#setenv MPICH_ENV_DISPLAY 1
# _REMEMBER_
# '-n' : number of PEs or total MPI processes
# '-d' : number of OpenMP threads per MPI process
# '-N' : number of MPI processes per node
# '-j' : number of CPUs to use per compute unit (1 = single-stream mode)
setenv OMP_NUM_THREADS 8
aprun -N 1 -n 8 -j 1 -d 8 -cc numa_node ./pw-mpi-omp-gpu.x -input ausurf_gamma.in | tee out.GPU.1-PER-NODE.$PBS_JOBID.v1
$ make -f Makefile.gpu distclean
$ cd GPU/
$ ./configure --enable-parallel --enable-openmp --enable-cuda --with-gpu-arch=35 --with-cuda-dir=${CRAY_CUDATOOLKIT_DIR} --disable-magma --enable-phigemm --with-scalapack --disable-profiling --with-elpa ARCH=crayxt
$ cd ../
$ make -f Makefile.gpu pw-gpu
$ cp GPU/PW/pw-gpu.x ${FINALDIR}/pw-mpi-omp-gpu-elpa.x
$ make -f Makefile.gpu distclean
$ cd GPU/
$ ./configure --enable-parallel --enable-openmp --enable-cuda --with-gpu-arch=35 --with-cuda-dir=${CRAY_CUDATOOLKIT_DIR} --enable-magma --enable-phigemm --without-scalapack --disable-profiling --with-internal-cblas ARCH=crayxt
$ cd ../
$ make -f Makefile.gpu pw-gpu
$ cp GPU/PW/pw-gpu.x ${FINALDIR}/pw-mpi-omp-gpu-magma.x
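A possible post-build sanity check (optional and site-dependent; meaningful only
if the executable is dynamically linked against the CUDA libraries):
$ ldd ${FINALDIR}/pw-mpi-omp-gpu-magma.x | grep -E "cublas|cufft"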
2. Good practices
- Each NVIDIA Tesla K20 GPU has 6 GB of memory on the card. Better to limit
@@ -108,91 +148,11 @@ $ cp GPU/PW/pw-gpu.x ${FINALDIR}/pw-mpi-omp-gpu-magma.x
- In order to share the GPU between multiple MPI processes within the node, it is
mandatory to export the variable CRAY_CUDA_PROXY ("export CRAY_CUDA_PROXY=1")
- compiling with hugepage support does not produce big benefits, need
- compiling with huge-pages support does not produce big benefits, need
more testing...
3. Example scripts (CSCS, SLURM)
3.1 TODI (SLURM)
#!/bin/bash
# Example requesting 4 nodes (32 cores in total in SINGLE STREAM MODE
# using 4 OpenMP threads per MPI process), 2 MPI processes per node (8 in total)
# sharing the NVIDIA K20x among them.
#SBATCH --job-name="QE-TEST"
#SBATCH --nodes=4
#SBATCH --time=00:25:00
#SBATCH --output=QE-BENCH.%j.o
#SBATCH --error=QE-BENCH.%j.e
#SBATCH --account=<...>
export CRAY_CUDA_PROXY=1
#export MALLOC_MMAP_MAX_=0
#export MALLOC_TRIM_THRESHOLD_=536870912
export MPICH_VERSION_DISPLAY=1
export MPICH_ENV_DISPLAY=1
export MPICH_CPUMASK_DISPLAY=1
# REMEMBER...
# '-n' : number of PEs or total MPI processes
# '-d' : number of OpenMP threads per MPI process
#export OMP_NUM_THREADS=8
#aprun -n 4 -j 1 -d 8 ./pw-mpi-omp-gpu.x -input ausurf_gamma.in | tee out.GPU.1-PER-NODE.$SLURM_JOB_ID.v1
export OMP_NUM_THREADS=4
aprun -n 8 -j 1 -d 4 ./pw-mpi-omp-gpu.x -input ausurf_gamma.in | tee out.GPU.2-PER-NODE.$SLURM_JOB_ID.v1
#export OMP_NUM_THREADS=2
#aprun -n 16 -j 1 -d 2 ./pw-mpi-omp-gpu.x -input ausurf_gamma.in | tee out.GPU.4-PER-NODE.$SLURM_JOB_ID.v1
3.2 TITAN (PBS Pro)
# IMPORTANT NOTE (1):
The per node charging factor changed from 16 to 30 to reflect the availability
of GPU/Accelerators. Job utilization is now calculated via the formula:
30 * walltime * number of requested nodes
# IMPORTANT NOTE (2):
Projects granted at ORNL usually have three letters (XXX) and three digits (YYY)
#!/bin/csh
# Example requesting 8 nodes (64 cores in total in SINGLE STREAM MODE
# using 8 OpenMP threads per MPI process), 1 MPI process per node (8 in total)
# dedicating the full NVIDIA K20x resource to a single MPI process.
#PBS -A <XXXYYY>
#PBS -N QE-BENCH
#PBS -j oe
#PBS -l walltime=1:00:00,nodes=8
cd $PBS_O_WORKDIR
setenv CRAY_CUDA_PROXY 1
setenv MPICH_ENV_DISPLAY 1
# REMEMBER...
# '-n' : number of PEs or total MPI processes
# '-d' : number of OpenMP threads per MPI process
setenv OMP_NUM_THREADS 8
aprun -n 8 -j 1 -d 8 ./pw-mpi-omp-gpu.x -input ausurf_gamma.in | tee out.GPU.1-PER-NODE.$PBS_JOBID.v1
# IMPORTANT NOTE:
Refer to this link
https://www.olcf.ornl.gov/kb_articles/spider-the-center-wide-lustre-file-system/
for information about filesystem management and availability
4. Benchmarks
5. Benchmarks
[TO BE ADDED]