Build scripts and nexus machine config update for archer2 (#3999)

* Modified nexus/lib/physical_system.py to read POSCAR files with a different name by specifying "format"

* machine archer2

* config for archer2

* corrected error in number of CPUs per node

* config archer2 updates, correct MPI libs

* config for archer2
https://docs.archer2.ac.uk/user-guide/hardware/

* more efficient MPI modules on ARCHER2

* build script for archer2, some modifications

* update config/build_archer2.sh

* undo change to "physical_system.py"

* changes to machines and test_machines for Archer2

* fix test error for archer2 machine

Co-authored-by: Andrea Zen <andrea.zen@unina.it>
Co-authored-by: Ye Luo <yeluo@anl.gov>

config/build_archer2.sh (new executable file)

@@ -0,0 +1,63 @@
#!/bin/bash
echo "ARCHER2: Information on hardware and software"
echo "https://www.archer2.ac.uk/about/hardware.html"
echo "and documentation:"
echo "https://docs.archer2.ac.uk"
echo
echo "Loading QMCPACK dependency modules for archer2"
echo
module restore
module load PrgEnv-gnu
module load cray-hdf5-parallel
module load cray-fftw
export FFTW_ROOT=$FFTW_DIR/..
module load libxml2
module load cmake
module load boost
module load cray-python
echo
echo "Loaded moduli:"
module list
echo
echo "In the running scipt (but not in compilation) also load the following two modules:"
echo " module load craype-network-ucx"
echo " module load cray-mpich-ucx"
echo "which improves a lot the scaling efficiency. "
echo
echo
declare -A builds=( ["cpu"]="-DBUILD_PPCONVERT=1" \
["complex_cpu"]="-DQMC_COMPLEX=1" \
)
mkdir bin
for build in "${!builds[@]}"
do
echo "building: $build with ${builds[$build]}"
rm bin/qmcpack_${build}
mkdir build_${build}
cd build_${build}
cmake -DCMAKE_C_COMPILER="cc" \
-DCMAKE_CXX_COMPILER="CC" \
-DCMAKE_SYSTEM_NAME=CrayLinuxEnvironment \
-D LibXml2_ROOT=$LIBXML2_ROOT \
-DBUILD_LMYENGINE_INTERFACE=0 \
${builds[$build]} \
..
make -j 20
if [ $? -eq 0 ]; then
build_dir=$(pwd)
if [ -e ${build_dir}/bin/qmcpack_complex ]; then
ln -sf ${build_dir}/bin/qmcpack_complex ${build_dir}/../bin/qmcpack_${build}
else
ln -sf ${build_dir}/bin/qmcpack ${build_dir}/../bin/qmcpack_${build}
fi
fi
cd ..
done
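
For reference, a minimal sketch of an ARCHER2 batch script that combines the pieces of this commit: the #SBATCH header and srun line match the test reference further below, and the two UCX modules are the ones the build script recommends loading at run time. The budget code, binary name, and input file are placeholders, not part of the patch.

#!/bin/bash
#SBATCH --job-name qmc
#SBATCH --account=e123        # placeholder ARCHER2 budget code
#SBATCH -N 2
#SBATCH --ntasks-per-node=128
#SBATCH --cpus-per-task=1
#SBATCH -t 06:30:00
#SBATCH --partition=standard
#SBATCH --qos=standard

# Loaded at run time only (not at compile time), as the build script advises:
module load craype-network-ucx
module load cray-mpich-ucx

export OMP_NUM_THREADS=1
# Same launch line the new Archer2 class generates for a 2-node job:
srun --distribution=block:block --hint=nomultithread -N 2 -n 256 ./bin/qmcpack_cpu input.xml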

nexus/lib/machines.py

@@ -3160,10 +3160,6 @@ class Andes(Supercomputer):
errfile_extension = '.error'
def post_process_job(self,job):
job.run_options.add(
N='-N {}'.format(job.nodes),
n='-n {}'.format(job.processes),
)
if job.threads>1:
job.run_options.add(
c = '-c {}'.format(job.threads),
@@ -3179,6 +3175,10 @@ class Andes(Supercomputer):
)
#end if
#end if
job.run_options.add(
N='-N {}'.format(job.nodes),
n='-n {}'.format(job.processes),
)
#end def post_process_job
def write_job_header(self,job):
@@ -3225,6 +3225,97 @@ class Andes(Supercomputer):
#end class Andes
## Added 05/04/2022 by A Zen
class Archer2(Supercomputer):
# https://docs.archer2.ac.uk/user-guide/hardware/
name = 'archer2'
requires_account = True
batch_capable = True
#executable_subfile = True
prefixed_output = True
outfile_extension = '.output'
errfile_extension = '.error'
def post_process_job(self,job):
job.run_options.add(
distribution='--distribution=block:block',
hint='--hint=nomultithread',
N='-N {}'.format(job.nodes),
n='-n {}'.format(job.processes),
)
if job.threads>1:
job.run_options.add(
c = '-c {}'.format(job.threads),
)
# if 'cpu_bind' not in job.run_options:
# if job.processes_per_node==self.cores_per_node:
# cpu_bind = '--cpu-bind=threads'
# else:
# cpu_bind = '--cpu-bind=cores'
# #end if
# job.run_options.add(
# cpu_bind = cpu_bind
# )
#end if
#end if
#end def post_process_job
def write_job_header(self,job):
if job.qos is None:
job.qos='standard'
#end if
base_partition = None
if job.qos == 'long':
max_time = 48
max_partition = 64
elif 'short' in job.qos:
max_time = 20.0/60.0
max_partition = 32
else:
max_time = 24
max_partition = 1024
#end if
job.total_hours = job.days*24 + job.hours + job.minutes/60.0 + job.seconds/3600.0
if job.total_hours > max_time:
self.warn('!!! ATTENTION !!!\n the maximum runtime on {0} should not be more than {1}\n you requested: {2}'.format(job.queue,max_time,job.total_hours))
job.hours = max_time
job.minutes =0
job.seconds =0
#end if
if job.nodes > max_partition:
self.warn('!!! ATTENTION !!!\n the maximum nodes on {0} should not be more than {1}\n you requested: {2}'.format(job.queue,max_partition,job.nodes))
job.nodes = max_partition
#end if
c='#!/bin/bash\n'
c+='#SBATCH --job-name '+str(job.name)+'\n'
c+='#SBATCH --account='+str(job.account)+'\n'
c+='#SBATCH -N '+str(job.nodes)+'\n'
c+='#SBATCH --ntasks-per-node={0}\n'.format(job.processes_per_node)
c+='#SBATCH --cpus-per-task={0}\n'.format(job.threads)
c+='#SBATCH -t {0}:{1}:{2}\n'.format(str(job.hours+24*job.days).zfill(2),str(job.minutes).zfill(2),str(job.seconds).zfill(2))
c+='#SBATCH -o {0}\n'.format(job.outfile)
c+='#SBATCH -e {0}\n'.format(job.errfile)
c+='#SBATCH --partition=standard\n'
c+='#SBATCH --qos={0}\n'.format(job.qos)
if job.email is not None:
c+='#SBATCH --mail-user {}\n'.format(job.email)
c+='#SBATCH --mail-type ALL\n'
#c+='#SBATCH --mail-type FAIL\n'
#end if
c+='\n'
#c+='cd $SLURM_SUBMIT_DIR\n'
#c+='\n'
c+='echo JobID : $SLURM_JOBID\n'
c+='echo Number of nodes requested: $SLURM_JOB_NUM_NODES\n'
c+='echo List of nodes assigned to the job: $SLURM_NODELIST\n'
c+='\n'
return c
#end def write_job_header
#end class Archer2
class Tomcat3(Supercomputer):
name = 'tomcat3'
requires_account = False
@@ -3297,6 +3388,7 @@ Rhea( 512, 2, 8, 128, 1000, 'srun', 'sbatch', 'squeue', 'sc
Andes( 704, 2, 16, 256, 1000, 'srun', 'sbatch', 'squeue', 'scancel')
Tomcat3( 8, 1, 64, 192, 1000, 'mpirun', 'sbatch', 'sacct', 'scancel')
SuperMUC_NG( 6336, 1, 48, 96, 1000,'mpiexec', 'sbatch', 'sacct', 'scancel')
Archer2( 5860, 2, 64, 512, 1000, 'srun', 'sbatch', 'squeue', 'scancel')
#machine accessor functions
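
As a sketch of how the new machine entry would be consumed from a Nexus user script (the account code and job parameters below are illustrative placeholders, not taken from this commit):

#! /usr/bin/env python3
# Hypothetical Nexus driver script targeting ARCHER2.
from nexus import settings, job

settings(
    machine = 'archer2',   # resolves to the Archer2 class registered above
    account = 'e123',      # placeholder budget code; requires_account = True
    runs    = 'runs',
    )

# 2 nodes x 128 tasks/node (2 sockets x 64 cores each), standard QOS.
# post_process_job then adds --distribution=block:block --hint=nomultithread
# to the generated srun command, as the tests below verify.
qmc_job = job(nodes=2, threads=1, hours=6, minutes=30, qos='standard')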

nexus/tests/unit/test_machines.py

@@ -1033,6 +1033,12 @@ def test_job_run_command():
('andes' , 'n2_t2' ) : 'srun -N 2 -c 2 --cpu-bind=cores -n 32 test.x',
('andes' , 'n2_t2_e' ) : 'srun -N 2 -c 2 --cpu-bind=cores -n 32 test.x',
('andes' , 'n2_t2_p2' ) : 'srun -N 2 -c 2 --cpu-bind=cores -n 4 test.x',
('archer2' , 'n1' ) : 'srun --distribution=block:block --hint=nomultithread -N 1 -n 128 test.x',
('archer2' , 'n1_p1' ) : 'srun --distribution=block:block --hint=nomultithread -N 1 -n 1 test.x',
('archer2' , 'n2' ) : 'srun --distribution=block:block --hint=nomultithread -N 2 -n 256 test.x',
('archer2' , 'n2_t2' ) : 'srun --distribution=block:block --hint=nomultithread -N 2 -c 2 -n 128 test.x',
('archer2' , 'n2_t2_e' ) : 'srun --distribution=block:block --hint=nomultithread -N 2 -c 2 -n 128 test.x',
('archer2' , 'n2_t2_p2' ) : 'srun --distribution=block:block --hint=nomultithread -N 2 -c 2 -n 4 test.x',
('attaway' , 'n1' ) : 'srun test.x',
('attaway' , 'n1_p1' ) : 'srun test.x',
('attaway' , 'n2' ) : 'srun test.x',
@@ -1382,6 +1388,26 @@ echo List of nodes assigned to the job: $SLURM_NODELIST
export ENV_VAR=1
export OMP_NUM_THREADS=1
srun -N 2 -n 64 test.x''',
archer2 = '''#!/bin/bash
#SBATCH --job-name jobname
#SBATCH --account=ABC123
#SBATCH -N 2
#SBATCH --ntasks-per-node=128
#SBATCH --cpus-per-task=1
#SBATCH -t 06:30:00
#SBATCH -o test.out
#SBATCH -e test.err
#SBATCH --partition=standard
#SBATCH --qos=standard
echo JobID : $SLURM_JOBID
echo Number of nodes requested: $SLURM_JOB_NUM_NODES
echo List of nodes assigned to the job: $SLURM_NODELIST
export ENV_VAR=1
export OMP_NUM_THREADS=1
srun --distribution=block:block --hint=nomultithread -N 2 -n 256 test.x''',
attaway = '''#!/bin/bash
#SBATCH -p batch
#SBATCH --job-name jobname
@@ -1893,6 +1919,7 @@ runjob --np 32 -p 16 $LOCARGS --verbose=INFO --envs OMP_NUM_THREADS=1 ENV_VAR=1
def job_files_same(jf1,jf2):
jf1 = process_job_file(jf1)
jf2 = process_job_file(jf2)
if not object_eq(jf1,jf2):
    print(f"compare --------------------\n * wj *\n{jf1}\n * ref_wj *\n{jf2}\n")
return object_eq(jf1,jf2)
#end def job_files_same