diff --git a/Tools/machines/greatlakes-umich/greatlakes_v100.sbatch b/Tools/machines/greatlakes-umich/greatlakes_v100.sbatch
index 0353c08456f..4814c439dd9 100644
--- a/Tools/machines/greatlakes-umich/greatlakes_v100.sbatch
+++ b/Tools/machines/greatlakes-umich/greatlakes_v100.sbatch
@@ -26,8 +26,7 @@ INPUTS=inputs
 # per node are 2x 2.4 GHz Intel Xeon Gold 6148
 # note: the system seems to only expose cores (20 per socket),
 #       not hyperthreads (40 per socket)
-export SRUN_CPUS_PER_TASK=20
-export OMP_NUM_THREADS=${SRUN_CPUS_PER_TASK}
+export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}
 
 # GPU-aware MPI optimizations
 GPU_AWARE_MPI="amrex.use_gpu_aware_mpi=1"
diff --git a/Tools/machines/karolina-it4i/karolina_gpu.sbatch b/Tools/machines/karolina-it4i/karolina_gpu.sbatch
index 6171ff03abc..ccb4f3dc2c3 100644
--- a/Tools/machines/karolina-it4i/karolina_gpu.sbatch
+++ b/Tools/machines/karolina-it4i/karolina_gpu.sbatch
@@ -25,13 +25,12 @@
 #SBATCH -o stdout_%j
 #SBATCH -e stderr_%j
 
-# OpenMP threads per MPI rank
-export OMP_NUM_THREADS=16
-export SRUN_CPUS_PER_TASK=16
-
 # set user rights to u=rwx;g=r-x;o=---
 umask 0027
 
+# OpenMP threads per MPI rank
+export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}
+
 # executable & inputs file or python interpreter & PICMI script here
 EXE=./warpx.rz
 INPUTS=./inputs_rz
diff --git a/Tools/machines/lonestar6-tacc/lonestar6_a100.sbatch b/Tools/machines/lonestar6-tacc/lonestar6_a100.sbatch
index bef40942ed6..933f21093a2 100644
--- a/Tools/machines/lonestar6-tacc/lonestar6_a100.sbatch
+++ b/Tools/machines/lonestar6-tacc/lonestar6_a100.sbatch
@@ -14,6 +14,7 @@
 #SBATCH -q regular
 #SBATCH -C gpu
 #SBATCH --exclusive
+#SBATCH --cpus-per-task=32
 #SBATCH --gpu-bind=none
 #SBATCH --gpus-per-node=4
 #SBATCH -o WarpX.o%j
@@ -27,7 +28,7 @@ INPUTS=inputs_small
 export MPICH_OFI_NIC_POLICY=GPU
 
 # threads for OpenMP and threaded compressors per MPI rank
-export SRUN_CPUS_PER_TASK=32
+export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}
 
 # depends on https://github.com/ECP-WarpX/WarpX/issues/2009
 #GPU_AWARE_MPI="amrex.the_arena_is_managed=0 amrex.use_gpu_aware_mpi=1"
diff --git a/Tools/machines/perlmutter-nersc/perlmutter_cpu.sbatch b/Tools/machines/perlmutter-nersc/perlmutter_cpu.sbatch
index d13c7e3b4e5..84e93dbb8ea 100644
--- a/Tools/machines/perlmutter-nersc/perlmutter_cpu.sbatch
+++ b/Tools/machines/perlmutter-nersc/perlmutter_cpu.sbatch
@@ -13,6 +13,8 @@
 #SBATCH -A <proj>
 #SBATCH -q regular
 #SBATCH -C cpu
+# 8 cores per chiplet, 2x SMP
+#SBATCH --cpus-per-task=16
 #SBATCH --ntasks-per-node=16
 #SBATCH --exclusive
 #SBATCH -o WarpX.o%j
@@ -30,10 +32,9 @@ INPUTS=inputs_small
 # This will be our MPI rank assignment (2x8 is 16 ranks/node).
 
 # threads for OpenMP and threaded compressors per MPI rank
-export SRUN_CPUS_PER_TASK=16 # 8 cores per chiplet, 2x SMP
 export OMP_PLACES=threads
 export OMP_PROC_BIND=spread
-export OMP_NUM_THREADS=${SRUN_CPUS_PER_TASK}
+export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}
 
 srun --cpu-bind=cores \
   ${EXE} ${INPUTS} \
diff --git a/Tools/machines/perlmutter-nersc/perlmutter_gpu.sbatch b/Tools/machines/perlmutter-nersc/perlmutter_gpu.sbatch
index f2ea5fa3e7f..37bd5d60c54 100644
--- a/Tools/machines/perlmutter-nersc/perlmutter_gpu.sbatch
+++ b/Tools/machines/perlmutter-nersc/perlmutter_gpu.sbatch
@@ -17,6 +17,7 @@
 # A100 80GB (256 nodes)
 #S BATCH -C gpu&hbm80g
 #SBATCH --exclusive
+#SBATCH --cpus-per-task=16
 # ideally single:1, but NERSC cgroups issue
 #SBATCH --gpu-bind=none
 #SBATCH --ntasks-per-node=4
@@ -33,8 +34,7 @@ export MPICH_OFI_NIC_POLICY=GPU
 
 # threads for OpenMP and threaded compressors per MPI rank
 # note: 16 avoids hyperthreading (32 virtual cores, 16 physical)
-export SRUN_CPUS_PER_TASK=16
-export OMP_NUM_THREADS=${SRUN_CPUS_PER_TASK}
+export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}
 
 # GPU-aware MPI optimizations
 GPU_AWARE_MPI="amrex.use_gpu_aware_mpi=1"
diff --git a/Tools/machines/tioga-llnl/tioga_mi300a.sbatch b/Tools/machines/tioga-llnl/tioga_mi300a.sbatch
index 0e29e24adcb..94ee97bc6a1 100644
--- a/Tools/machines/tioga-llnl/tioga_mi300a.sbatch
+++ b/Tools/machines/tioga-llnl/tioga_mi300a.sbatch
@@ -12,6 +12,7 @@
 #SBATCH -J WarpX
 #S BATCH -A <proj>  # project name not needed yet
 #SBATCH -p mi300a
+#SBATCH --cpus-per-task=16
 #SBATCH --gpu-bind=none
 #SBATCH --ntasks-per-node=4
 #SBATCH --gpus-per-node=4
@@ -27,8 +28,7 @@ export MPICH_OFI_NIC_POLICY=GPU
 
 # threads for OpenMP and threaded compressors per MPI rank
 # note: 16 avoids hyperthreading (32 virtual cores, 16 physical)
-export SRUN_CPUS_PER_TASK=16
-export OMP_NUM_THREADS=${SRUN_CPUS_PER_TASK}
+export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}
 
 # GPU-aware MPI optimizations
 GPU_AWARE_MPI="amrex.use_gpu_aware_mpi=1"
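Each file in this patch follows the same pattern: the per-rank CPU count is requested up front with #SBATCH --cpus-per-task, and OMP_NUM_THREADS is derived from SLURM_CPUS_PER_TASK, the variable Slurm sets for that request, instead of hard-coding SRUN_CPUS_PER_TASK in the script body. A minimal sketch of the pattern; the rank count, thread count, and the warpx/inputs names are illustrative placeholders, not taken from the patch:

#!/bin/bash
#SBATCH --ntasks-per-node=4    # illustrative rank count
#SBATCH --cpus-per-task=16     # Slurm exports this value as SLURM_CPUS_PER_TASK

# threads for OpenMP and threaded compressors per MPI rank
export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}

# placeholder executable and inputs file
srun --cpu-bind=cores ./warpx inputs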