mirror of https://gitlab.com/QEF/q-e.git
89 lines
1.9 KiB
Plaintext
89 lines
1.9 KiB
Plaintext
|
Info by F. Spiga (spiga -dot- filippo -at- gmail -dot- com) -- Jun 19, 2013
|
||
|
|
||
|
*** THIS IS A DRAFT ***
|
||
|
|
||
|
Machine name : Piz Daint at CSCS (CH)
|
||
|
Machine spec : http://user.cscs.ch/hardware/piz_daint_cray_xc30/index.html
|
||
|
|
||
|
|
||
|
0. Architecture peculiarities
|
||
|
|
||
|
[TO BE ADDED]
|
||
|
|
||
|
|
||
|
1. Compile the code
|
||
|
|
||
|
Up to now, extensive tests have shown that the Intel compiler is the best choice
|
||
|
to exploit GPU capabilities of QE and QE-GPU on CRAY XC30 nodes.
|
||
|
|
||
|
# NOTE:
|
||
|
Although the selected compiler is Intel, xt-libsci is used.
|
||
|
|
||
|
Intel:
|
||
|
- compile:
|
||
|
- CPU execution:
|
||
|
- GPU execution without CRAY_CUDA_PROXY:
|
||
|
- GPU execution with CRAY_CUDA_PROXY:
|
||
|
|
||
|
PGI, CRAY:
|
||
|
- compile:
|
||
|
- CPU execution:
|
||
|
- GPU execution without CRAY_CUDA_PROXY:
|
||
|
- GPU execution with CRAY_CUDA_PROXY:
|
||
|
|
||
|
GNU:
|
||
|
|
||
|
|
||
|
1.2 Piz Daint (CSCS) :
|
||
|
|
||
|
[TO BE ADDED]
|
||
|
|
||
|
|
||
|
2. Good practices
|
||
|
|
||
|
[TO BE ADDED]
|
||
|
|
||
|
|
||
|
3. Example scripts (CSCS, SLURM)
|
||
|
|
||
|
3.1 Piz Daint (SLURM)
|
||
|
|
||
|
#!/bin/bash
|
||
|
|
||
|
# Example requesting 4 nodes (32 cores in total in SINGLE STREAM MODE
|
||
|
# using 4 OpenMP threads per MPI process), 2 MPI processes per node (8 in total)
|
||
|
# sharing the NVIDIA K20x among them.
|
||
|
|
||
|
#SBATCH --job-name="QE-TEST"
|
||
|
#SBATCH --nodes=4
|
||
|
#SBATCH --time=00:25:00
|
||
|
#SBATCH --output=QE-BENCH.%j.o
|
||
|
#SBATCH --error=QE-BENCH.%j.e
|
||
|
#SBATCH --account=<...>
|
||
|
|
||
|
export CRAY_CUDA_PROXY=1
|
||
|
|
||
|
#export MALLOC_MMAP_MAX_=0
|
||
|
#export MALLOC_TRIM_THRESHOLD_=536870912
|
||
|
|
||
|
export MPICH_VERSION_DISPLAY=1
|
||
|
export MPICH_ENV_DISPLAY=1
|
||
|
export MPICH_CPUMASK_DISPLAY=1
|
||
|
|
||
|
# REMEMBER...
|
||
|
# '-n' : number of PEs or total MPI processes
|
||
|
# '-d' : number of OpenMP threads per PE (MPI process)
|
||
|
|
||
|
#export OMP_NUM_THREADS=8
|
||
|
#aprun -n 4 -j 1 -d 8 ./pw-mpi-omp-gpu.x -input ausurf_gamma.in | tee out.GPU.1-PER-NODE.$SLURM_JOB_ID.v1
|
||
|
|
||
|
export OMP_NUM_THREADS=4
|
||
|
aprun -n 8 -j 1 -d 4 ./pw-mpi-omp-gpu.x -input ausurf_gamma.in | tee out.GPU.2-PER-NODE.$SLURM_JOB_ID.v1
|
||
|
|
||
|
#export OMP_NUM_THREADS=2
|
||
|
#aprun -n 16 -j 1 -d 2 ./pw-mpi-omp-gpu.x -input ausurf_gamma.in | tee out.GPU.4-PER-NODE.$SLURM_JOB_ID.v1
|
||
|
|
||
|
|
||
|
4. Benchmarks
|
||
|
|
||
|
[TO BE ADDED]
|