From 284faf242de938a8520c8dc54405c5aa961f4ed7 Mon Sep 17 00:00:00 2001 From: pneerincx Date: Tue, 15 Jun 2021 13:59:43 +0200 Subject: [PATCH 01/10] Improved QoS settings (WIP). --- roles/slurm_management/files/slurm.epilog | 10 ++-- roles/slurm_management/files/slurm.prolog | 52 ++++++++++++------- roles/slurm_management/files/slurm.taskprolog | 40 +++++++++----- .../configure_slurm_accounting_db.bash | 35 ++++++++++++- 4 files changed, 98 insertions(+), 39 deletions(-) diff --git a/roles/slurm_management/files/slurm.epilog b/roles/slurm_management/files/slurm.epilog index c07e0abae..090d62847 100644 --- a/roles/slurm_management/files/slurm.epilog +++ b/roles/slurm_management/files/slurm.epilog @@ -1,6 +1,6 @@ #!/bin/bash -if [ -z "${SLURM_JOB_ID}" ]; then +if [[ -z "${SLURM_JOB_ID}" ]]; then logger -s "WARN: SLURM_JOB_ID is empty or unset in SLURM epilog." exit 0 fi @@ -8,8 +8,10 @@ fi # # Cleanup job's private tmp dir. # -TMPDIR="/local/${SLURM_JOB_ID}/" -rm -rf "${TMPDIR}" +TMPDIR="/local/${SLURM_JOB_ID:-missing_SLURM_JOB_ID}/" +if [[ -e "${TMPDIR}" ]]; then + rm -Rf "${TMPDIR}" +fi # # Append resource usage stats to job's *.out file if we have an STDOUT file. @@ -19,7 +21,7 @@ SCONTROL_JOB_INFO="$(scontrol show job ${SLURM_JOB_ID})" SLURM_JOB_STDOUT="$(printf '%s\n' "${SCONTROL_JOB_INFO}" | grep 'StdOut=' | sed 's/[[:space:]]*StdOut=//')" SLURM_JOB_NODE="$(printf '%s\n' "${SCONTROL_JOB_INFO}" | grep 'BatchHost=' | sed 's/[[:space:]]*BatchHost=//')" SLURM_JOB_STDOUT="$(echo "${SLURM_JOB_STDOUT}" | sed "s/%N/${SLURM_JOB_NODE}/")" -if [ -w "${SLURM_JOB_STDOUT}" ]; then +if [[ -w "${SLURM_JOB_STDOUT}" ]]; then sformat='JobId,Elapsed,AllocCPUs,AveCPU,ReqMem,MaxVMSize,MaxRSS,MaxDiskRead,MaxDiskWrite' echo '#################################################################################################################' >> "${SLURM_JOB_STDOUT}" echo '# Job details recorded by SLURM job epilog using sacct. #' >> "${SLURM_JOB_STDOUT}" diff --git a/roles/slurm_management/files/slurm.prolog b/roles/slurm_management/files/slurm.prolog index ec35c9da1..a8223eda4 100644 --- a/roles/slurm_management/files/slurm.prolog +++ b/roles/slurm_management/files/slurm.prolog @@ -1,34 +1,46 @@ #!/bin/bash -# -# Make sure we are successful in making tmp dirs in /local. -# When this failed the job should not continue as SLURM will default to /tmp, -# which is not suitable for heavy random IO nor large data sets. -# Hammering /tmp may effectively result in the node going down. -# When the prolog fails the node will be set to state=DRAIN instead. -# - -if [ -z "${SLURM_JOB_ID}" ]; then +if [[ -z "${SLURM_JOB_ID}" ]]; then logger -s "FATAL: SLURM_JOB_ID is empty or unset in SLURM prolog." exit 1 +elif [[ -z "${SLURM_JOB_QOS}" ]]; then + logger -s "FATAL: SLURM_JOB_QOS is empty or unset in SLURM prolog." + exit 1 #else -# logger -s "DEBUG: Found SLURM_JOB_ID ${SLURM_JOB_ID} in SLURM prolog." +# logger -s "DEBUG: Found SLURM_JOB_ID ${SLURM_JOB_ID} and SLURM_JOB_QOS ${SLURM_JOB_QOS} in SLURM prolog." fi set -e set -u +LOCAL_SCRATCH_DIR='/local' # # Check if local scratch dir is mountpoint and hence not a dir on the system disk. # -LOCAL_SCRATCH_DIR='/local' -if [ $(stat -c '%d' "${LOCAL_SCRATCH_DIR}") -eq $(stat -c '%d' "${LOCAL_SCRATCH_DIR}/..") ]; then - logger -s "FATAL: local scratch disk (${LOCAL_SCRATCH_DIR}) is not mounted." - exit 1 -#else -# logger -s "DEBUG: local scratch disk (${LOCAL_SCRATCH_DIR}) is mounted." 
+if [[ $(stat -c '%d' "${LOCAL_SCRATCH_DIR}") -eq $(stat -c '%d' "${LOCAL_SCRATCH_DIR}/..") ]]; then + if [[ "${SLURM_JOB_QOS}" =~ ^ds.* ]]; then + # + # For the data staging QoS "ds", which executes jobs only on the UI, + # a dedicated tmp dir per job may be absent as not all UIs have a /local mount. + # + logger -s "WARN: local scratch disk (${LOCAL_SCRATCH_DIR}) is not mounted." + else + # + # Make sure we can create tmp dirs in /local on compute nodes. + # When this fails the job must not continue as SLURM will default to /tmp, + # which is not suitable for heavy random IO nor large data sets. + # Hammering /tmp may effectively result in the node going down. + # When the prolog fails the node will be set to state=DRAIN instead. + # + logger -s "FATAL: local scratch disk (${LOCAL_SCRATCH_DIR}) is not mounted." + exit 1 + fi +else + # + # Create dedicated tmp dir for this job. + # + TMPDIR="${LOCAL_SCRATCH_DIR}/${SLURM_JOB_ID}/" + #logger -s "DEBUG: local scratch disk (${LOCAL_SCRATCH_DIR}) is mounted. Trying to create ${TMPDIR} ..." + mkdir -m 700 -p "${TMPDIR}" || logger -s "FATAL: failed to create ${TMPDIR}." + chown "${SLURM_JOB_USER}" "${TMPDIR}" || logger -s "FATAL: failed to chown ${TMPDIR}." fi - -TMPDIR="${LOCAL_SCRATCH_DIR}/${SLURM_JOB_ID}/" -mkdir -m 700 -p "${TMPDIR}" || logger -s "FATAL: failed to create ${TMPDIR}." -chown "${SLURM_JOB_USER}" "${TMPDIR}" || logger -s "FATAL: failed to chown ${TMPDIR}." diff --git a/roles/slurm_management/files/slurm.taskprolog b/roles/slurm_management/files/slurm.taskprolog index c827e4167..beceded73 100644 --- a/roles/slurm_management/files/slurm.taskprolog +++ b/roles/slurm_management/files/slurm.taskprolog @@ -1,30 +1,44 @@ #!/bin/bash -# -# Make sure we have a tmp dir in /local. -# When this failed the job should not continue as SLURM will default to /tmp, -# which is not suitable for heavy random IO nor large data sets. -# Hammering /tmp may effectively result in the node going down. -# When the prolog fails the node will be set to state=DRAIN instead. -# - if [ -z "${SLURM_JOB_ID}" ]; then logger -s "FATAL: SLURM_JOB_ID is empty or unset in SLURM task prolog." exit 1 +elif [[ -z "${SLURM_JOB_QOS}" ]]; then + logger -s "FATAL: SLURM_JOB_QOS is empty or unset in SLURM task prolog." + exit 1 fi set -e set -u +# +# Make sure we have a tmp dir in /local on compute nodes. +# When this failed the job should not continue as SLURM will default to /tmp, +# which is not suitable for heavy random IO nor large data sets. +# Hammering /tmp may effectively result in the node going down. +# When the prolog fails the node will be set to state=DRAIN instead. +# +# For the data staging QoS "ds", which executes jobs only on the UI, +# a dedicated tmp dir per job may be absent as not all UIs have a /local mount. +# TMPDIR="/local/${SLURM_JOB_ID}/" - -if [ ! -d "${TMPDIR}" ]; then +if [[ ! -d "${TMPDIR}" ]] && [[ ! "${SLURM_JOB_QOS}" =~ ^ds.* ]]; then logger -s "FATAL: TMPDIR ${TMPDIR} is not available in SLURM task prolog." exit 1 +else + # + # STDOUT from this task prolog is used to initialize the job task's env, + # so we need to print the export statements to STDOUT. + # + echo "export TMPDIR=${TMPDIR}" fi # -# STDOUT from this task prolog is used to initialize the job task's env, -# so we need to print the export statements to STDOUT. +# Set TMOUT to configure automagic logout from interactive sessions +# after 30 minutes of inactivity. 
# -echo "export TMPDIR=${TMPDIR}" +if [[ "${SLURM_JOB_QOS}" =~ ^interactive.* ]]; then + echo "TMOUT=1800" + echo "readonly TMOUT" + echo "export TMOUT" +fi \ No newline at end of file diff --git a/roles/slurm_management/templates/configure_slurm_accounting_db.bash b/roles/slurm_management/templates/configure_slurm_accounting_db.bash index cd3fc958c..b0de5bd29 100644 --- a/roles/slurm_management/templates/configure_slurm_accounting_db.bash +++ b/roles/slurm_management/templates/configure_slurm_accounting_db.bash @@ -77,6 +77,7 @@ sacctmgr -i modify qos Name='regular-medium' set \ Priority=10 \ Preempt='leftover-short,leftover-medium,leftover-long' \ GrpSubmit=30000 MaxSubmitJobsPU=5000 MaxWall=1-00:00:00 \ + GrpTRES=cpu={{ [1, (cluster_cores_total | float * 0.6) | int] | max }},mem={{ [1000, (cluster_mem_total | float * 0.6) | int] | max }} \ MaxTRESPU=cpu={{ [1, (cluster_cores_total | float * 0.4) | int] | max }},mem={{ [1000, (cluster_mem_total | float * 0.4) | int] | max }} sacctmgr -i create qos set Name='regular-long' @@ -94,6 +95,7 @@ sacctmgr -i modify qos Name='regular-long' set \ sacctmgr -i create qos set Name='priority' sacctmgr -i modify qos Name='priority' set \ Description='High priority Quality of Service level with corresponding higher impact on your Fair Share.' \ + Preempt='leftover-short,leftover-medium,leftover-long' \ Priority=20 \ UsageFactor=2 \ GrpSubmit=5000 MaxSubmitJobsPU=1000 \ @@ -103,6 +105,7 @@ sacctmgr -i create qos set Name='priority-short' sacctmgr -i modify qos Name='priority-short' set \ Description='priority-short' \ Priority=20 \ + Preempt='leftover-short,leftover-medium,leftover-long' \ UsageFactor=2 \ GrpSubmit=5000 MaxSubmitJobsPU=1000 MaxWall=06:00:00 \ MaxTRESPU=cpu={{ [1, (cluster_cores_total | float * 0.25) | int] | max }},mem={{ [1000, (cluster_mem_total | float * 0.25) | int] | max }} @@ -111,20 +114,42 @@ sacctmgr -i create qos set Name='priority-medium' sacctmgr -i modify qos Name='priority-medium' set \ Description='priority-medium' \ Priority=20 \ + Preempt='leftover-short,leftover-medium,leftover-long' \ UsageFactor=2 \ GrpSubmit=2500 MaxSubmitJobsPU=500 MaxWall=1-00:00:00 \ - GrpTRES=cpu={{ [1, (cluster_cores_total | float * 0.5) | int] | max }},mem={{ [1000, (cluster_mem_total | float * 0.5) | int] | max }} \ + GrpTRES=cpu={{ [1, (cluster_cores_total | float * 0.6) | int] | max }},mem={{ [1000, (cluster_mem_total | float * 0.6) | int] | max }} \ MaxTRESPU=cpu={{ [1, (cluster_cores_total | float * 0.2) | int] | max }},mem={{ [1000, (cluster_mem_total | float * 0.2) | int] | max }} sacctmgr -i create qos set Name='priority-long' sacctmgr -i modify qos Name='priority-long' set \ Description='priority-long' \ Priority=20 \ + Preempt='leftover-short,leftover-medium,leftover-long' \ UsageFactor=2 \ GrpSubmit=250 MaxSubmitJobsPU=50 MaxWall=7-00:00:00 \ - GrpTRES=cpu={{ [1, (cluster_cores_total | float * 0.2) | int] | max }},mem={{ [1000, (cluster_mem_total | float * 0.2) | int] | max }} \ + GrpTRES=cpu={{ [1, (cluster_cores_total | float * 0.3) | int] | max }},mem={{ [1000, (cluster_mem_total | float * 0.3) | int] | max }} \ MaxTRESPU=cpu={{ [1, (cluster_cores_total | float * 0.1) | int] | max }},mem={{ [1000, (cluster_mem_total | float * 0.1) | int] | max }} +# +# QoS interactive +# +sacctmgr -i create qos set Name='interactive' +sacctmgr -i modify qos Name='interactive' set \ + Description='Highest priority Quality of Service level for interactive sessions.' 
\ + Priority=30 \ + UsageFactor=1 \ + MaxSubmitJobsPU=1 \ + GrpTRES=cpu=0,mem=0 + +sacctmgr -i create qos set Name='interactive-short' +sacctmgr -i modify qos Name='interactive-short' set \ + Description='interactive-short' \ + Priority=30 \ + Preempt='leftover-short,leftover-medium,leftover-long,regular-short' \ + UsageFactor=1 \ + MaxSubmitJobsPU=1 MaxWall=06:00:00 \ + MaxTRESPU=cpu={{ [1, (vcompute_max_cpus_per_node | float * 0.5) | int] | max }},mem={{ [1000, (vcompute_max_mem_per_node | float * 0.5) | int] | max }} + # # QoS ds # @@ -189,6 +214,9 @@ sacctmgr -i modify account root set \ sacctmgr -i modify account root set \ QOS+=ds,ds-short,ds-medium,ds-long +sacctmgr -i modify account root set \ + QOS+=interactive,interactive-short + sacctmgr -i modify account root set \ DefaultQOS=priority @@ -207,6 +235,9 @@ sacctmgr -i modify account users set \ sacctmgr -i modify account users set \ QOS+=ds,ds-short,ds-medium,ds-long +sacctmgr -i modify account users set \ + QOS+=interactive,interactive-short + sacctmgr -i modify account users set \ DefaultQOS=regular
From 4e6280c23c76293448ead918b4fe34f607ebc829 Mon Sep 17 00:00:00 2001 From: pneerincx Date: Fri, 18 Jun 2021 10:52:20 +0200 Subject: [PATCH 03/10] Updating documentation (WIP).
--- roles/online_docs/templates/mkdocs/docs/analysis.md | 2 +- single_group_playbooks/pre_deploy_checks.yml | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/roles/online_docs/templates/mkdocs/docs/analysis.md b/roles/online_docs/templates/mkdocs/docs/analysis.md index c26dcabd1..f3598653b 100644 --- a/roles/online_docs/templates/mkdocs/docs/analysis.md +++ b/roles/online_docs/templates/mkdocs/docs/analysis.md @@ -498,7 +498,7 @@ The compute nodes of this cluster do not have local scratch disks. If your workload uses a random IO pattern that produces too much load on a shared file system, you should consider using a different algorithm or different cluster. {% endif %} -## Debugging and Frequent Asked Question (FAQs) +## Debugging and Frequent Asked Questions (FAQs) #### Q: How do I know what environment is available to my job on an execution host? diff --git a/single_group_playbooks/pre_deploy_checks.yml b/single_group_playbooks/pre_deploy_checks.yml index c071ccf9c..658b29586 100644 --- a/single_group_playbooks/pre_deploy_checks.yml +++ b/single_group_playbooks/pre_deploy_checks.yml @@ -11,3 +11,7 @@ msg: "You must update Ansible to at least {{ minimal_ansible_version }}.x to use this playbook." vars: minimal_ansible_version: 2.10 + - name: 'Verify that the group_vars were parsed.' + assert: + that: slurm_cluster_name is defined + msg: "FATAL: the slurm_cluster_name Ansible variable is undefined, which suggests that the group_vars were not parsed." From cf220a94f6492628033f177c713a5441d8a928af Mon Sep 17 00:00:00 2001 From: pneerincx Date: Tue, 22 Jun 2021 19:16:59 +0200 Subject: [PATCH 04/10] Updating documentation (WIP). --- .../templates/mkdocs/docs/analysis.md | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/roles/online_docs/templates/mkdocs/docs/analysis.md b/roles/online_docs/templates/mkdocs/docs/analysis.md index f3598653b..edf097e59 100644 --- a/roles/online_docs/templates/mkdocs/docs/analysis.md +++ b/roles/online_docs/templates/mkdocs/docs/analysis.md @@ -229,17 +229,17 @@ scancel -u [your account] When you need to interact with a running job you can start an interactive session with the [srun](http://slurm.schedmd.com/srun.html) command. This creates a shell on a compute node, which works the same as a shell on the User Interface except that the shell is restricted to the requested resources. -This is ideal for debugging/testing and prevents your work from running out of control and crashing processes from other users or vice versa. -Just like for the ```sbatch``` command you will need to request resources like amount of cores, amount of memory, work allocation time (walltime), etc. +This is ideal for debugging/testing and prevents your work from running out of control, crashing processes from other users or vice versa. +Just like for the ```sbatch``` command for batch jobs you will need to request resources like amount of cores, amount of memory, work allocation time (walltime), etc. for interactive jobs too. E.g. to request a session for one hour: ``` -srun --cpus-per-task=1 --mem=1gb --nodes=1 --qos=priority --time=01:00:00 --pty bash -i +srun --cpus-per-task=1 --mem=1gb --nodes=1 --qos=interactive --time=01:00:00 --pty bash -i ``` When the requested resources are available the interactive session will start immediately. -To increase the chance your interactive session will start quickly, even when the cluster is relatively busy, you can request _Quality of Service_ level _priority_ with ```--qos=priority```. 
+To increase the chance your interactive session will start quickly, even when the cluster is relatively busy, you can request _Quality of Service_ level _interactive_ with ```--qos=interactive```. **Essential**: the order of ```srun``` arguments is not important except that ```--pty bash -i``` must be last. -Any options after that are interpreted as arguments for the requested shell and not for the ```srun``` command. +Any options after ```--pty bash``` are interpreted as arguments for the requested shell and not for the ```srun``` command. Hence the ```-i``` in the example is an argument for the ```bash``` shell. When you exit the bash shell using either the ```exit``` command or by pressing ```CTRL+d``` the interactive job will be cancelled automagically and the corresponding resources released. @@ -253,6 +253,7 @@ Which job will be started next is determined based on 1. Job priority 2. Backfill to improve scheduling efficiency +3. Whether a job in the queue can preempt a running job or not. #### Job priority @@ -275,6 +276,16 @@ Slurm will start them before the higher priority _5 core for 1 hour_ job, which Please note that this is a simplified example taking only cores and time into account. In real life Slurm is playing sort of a multidimensional [Tetris](https://nl.wikipedia.org/wiki/Tetris) game taking other resources like memory into account too. +#### Job preemption + +Preemption means that a job in the queue can get resources by pushing another running job out of its way. +For the running job that gets preempted this means it will get killed and automatically rescheduled. +Unless the rescheduled job can use a smart form of check pointing to resume from where it got interrupted, +this means it will have to start all over from scratch and any resources it used up to the point it got killed & rescheduled were wasted. +Therefore preemption of short jobs can help to free up resources for high priority jobs on a busy cluster without wasting a lot, +but for long running jobs it is less suitable, because the longer the walltime the higher the chance it gets preempted and hence the more resources got wasted. + + ## Quality of Service We use 5 Quality of Service (QoS) levels with 3 QoS sub-levels each. From 6ec00561cb13dd55fcfa2376bbf544365314489b Mon Sep 17 00:00:00 2001 From: pneerincx Date: Tue, 22 Jun 2021 19:18:05 +0200 Subject: [PATCH 05/10] Tweaked SchedulerParameters in slurm.conf. --- roles/slurm_management/templates/slurm.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/roles/slurm_management/templates/slurm.conf b/roles/slurm_management/templates/slurm.conf index 37fc3e487..585d1416b 100644 --- a/roles/slurm_management/templates/slurm.conf +++ b/roles/slurm_management/templates/slurm.conf @@ -65,7 +65,7 @@ Waittime=15 # SchedulerType=sched/backfill SchedulerPort=7321 -SchedulerParameters=kill_invalid_depend,bf_continue,bf_max_job_test=10000,bf_max_job_user=5000,default_queue_depth=500,bf_window=10080,bf_resolution=300,preempt_reorder_count=100 +SchedulerParameters=kill_invalid_depend,bf_continue,bf_max_job_test=10000,bf_max_job_user=5000,default_queue_depth=1000,bf_window=10080,bf_resolution=300,bf_busy_nodes,preempt_reorder_count=100,preempt_youngest_first SelectType=select/cons_res SelectTypeParameters=CR_Core_Memory #SchedulerAuth= From 0fb0c2e3e05197a454f13b0ab8d8f832c33bf470 Mon Sep 17 00:00:00 2001 From: pneerincx Date: Mon, 28 Jun 2021 15:42:51 +0200 Subject: [PATCH 06/10] Updated online documentation for new QoS level "interactive". 
--- .../templates/mkdocs/docs/analysis.md | 42 +++++++++++++------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/roles/online_docs/templates/mkdocs/docs/analysis.md b/roles/online_docs/templates/mkdocs/docs/analysis.md index edf097e59..c91b375d7 100644 --- a/roles/online_docs/templates/mkdocs/docs/analysis.md +++ b/roles/online_docs/templates/mkdocs/docs/analysis.md @@ -283,8 +283,7 @@ For the running job that gets preempted this means it will get killed and automa Unless the rescheduled job can use a smart form of check pointing to resume from where it got interrupted, this means it will have to start all over from scratch and any resources it used up to the point it got killed & rescheduled were wasted. Therefore preemption of short jobs can help to free up resources for high priority jobs on a busy cluster without wasting a lot, -but for long running jobs it is less suitable, because the longer the walltime the higher the chance it gets preempted and hence the more resources got wasted. - +but for long running jobs it is less suitable, because the longer the walltime, the higher the chance it gets preempted and hence the more resources got wasted. ## Quality of Service @@ -295,6 +294,7 @@ The base QoS levels are determined by the users and these allow you to different * jobs with lower versus higher priority * high performance computing versus data staging jobs + * batch jobs versus interactive jobs ![QoS](img/slurm_qos.svg) @@ -304,13 +304,15 @@ By specifying a QoS level with higher priority you can request Slurm to re-order #### QoS levels -| QoS | Priority | Usage factor | Available resources | Shared Storage | -|:---------- |:----------- |:---------------------- |:--------------------------------------------- |:---------------| -| leftover | 0 | none | Lots, up to the whole cluster for short jobs. | tmp only | -| regular | default | default | Quite some, but never the whole cluster. | tmp only | -| priority | default x 2 | default x 2 | Just a few, max ~ 25 percent of the cluster. | tmp only | -| panic mode | default x 2 | default x 2 | Occasionally: Just a few. | tmp only | -| ds | default | default | Minimal: max 1 core + 1GB mem per job. | tmp and prm | +| QoS | Priority | Usage factor | Available resources | Shared Storage | +|:----------- |:----------- |:------------- |:--------------------------------------------- |:---------------| +| leftover | 0 | none | Lots, up to the whole cluster for short jobs. | tmp only | +| regular | default | default | Quite some, but never the whole cluster. | tmp only | +| priority | default x 2 | default x 2 | Just a few, max ~ 25 percent of the cluster. | tmp only | +| panic mode | default x 2 | default x 2 | Occasionally: Just a few. | tmp only | +| interactive | default x 3 | default | Minimal: max 1 job per user. | tmp only | +| ds | default | default | Minimal: max 1 core + 1GB mem per job. | tmp and prm | + Recent jobs determine your _fair share_ weight when calculating job priority: The more resources you recently consumed the lower your priority for new jobs. @@ -338,12 +340,15 @@ You are a cheapskate and decided to go Dutch. You'll consume whatever resources are _leftover_ and will accept lowest priority for your jobs. The _usage factor_ is zero, so any resources consumed using this QoS level will not impact your _fair share_, which is used for calculating job priority. +Jobs from all other QoS levels can preempt jobs in QoS level _leftover_. 
It may take some time for this research project to complete, but hey you got it for free! #### 2. QoS regular No goofy exceptions; this is the default when no QoS level is requested explicitly. -Running with this QoS level will process jobs with standard priority and count for your _fair share_ accordingly. +Running with this QoS level will process jobs with standard priority and count for your _fair share_ accordingly. +Medium and long running jobs cannot get preempted: once started, they will be allowed to finish +no matter how busy the cluster is. Short jobs may get preempted, but only by jobs in QoS _interactive_. #### 3. QoS priority @@ -351,7 +356,8 @@ You are working on multiple projects simultaneously and have a lot of jobs in th but are eager to get the results for jobs submitted with this QoS level first. The total amount of resources available to this QoS level is limited and your _fair share_ factor is charged double the amount of (normalised) resources as compared to when using QoS ```regular```, -so choose wisely what you submit with QoS level ```priority```. +so choose wisely what you submit with QoS level ```priority```. +Jobs cannot get preempted by others: once started, they will be allowed to finish. #### 4. QoS panic mode @@ -376,7 +382,19 @@ the following rules apply: Using these additional resources we can then either increase the capacity to process jobs faster using QoS level ```regular``` or create a dedicated QoS level with increased _fair share_ ratio depending on investment. (minimal investment 10K euro) -#### 5. QoS ds +#### 5. QoS interactive + +A dedicated QoS level for interactive jobs. These jobs will get super mega hyper priority as staring at a terminal waiting for a session to start isn't fun. +You can have only one job in QoS _interactive_ otherwise it would not be interactive anymore. +There is no _medium_ nor _long_ QoS sub-level for interactive jobs: +if you need more than 6 hours it is either no longer interactive work or it is not healthy and you need to get yourself a break! +Jobs in QoS _interactive-short_ cannot get preempted themselves and can preempt jobs in QoS _regular-short_ & _leftover-*_. +Interactive jobs will have a bash ```${TMOUT}``` environment variable set to 30 minutes, so you can get a quick coffee break, +but do not try to keep in-active *interactive* sessions alive by running silly programs that waste CPU cycles: +Logout if you go to a meeting and start a new interactive job when you get back instead. +Wasting more than 30 minutes worth of resources in this QoS may lead to a temporary ban. + +#### 6. QoS ds QoS dedicated for **d**ata **s**taging and the only one where jobs can access both _tmp_ as well as _prm_ shared storage systems. To prevent abuse jobs can only use a single core and 1 GB memory max, From ecb1f1ada833d7a438d4ca53cb79aa14d46f4e65 Mon Sep 17 00:00:00 2001 From: pneerincx Date: Mon, 28 Jun 2021 15:47:49 +0200 Subject: [PATCH 07/10] Updated online documentation for new QoS level "interactive". 
--- .../templates/mkdocs/docs/analysis.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/roles/online_docs/templates/mkdocs/docs/analysis.md b/roles/online_docs/templates/mkdocs/docs/analysis.md index c91b375d7..d8e391c23 100644 --- a/roles/online_docs/templates/mkdocs/docs/analysis.md +++ b/roles/online_docs/templates/mkdocs/docs/analysis.md @@ -304,14 +304,14 @@ By specifying a QoS level with higher priority you can request Slurm to re-order #### QoS levels -| QoS | Priority | Usage factor | Available resources | Shared Storage | -|:----------- |:----------- |:------------- |:--------------------------------------------- |:---------------| -| leftover | 0 | none | Lots, up to the whole cluster for short jobs. | tmp only | -| regular | default | default | Quite some, but never the whole cluster. | tmp only | -| priority | default x 2 | default x 2 | Just a few, max ~ 25 percent of the cluster. | tmp only | -| panic mode | default x 2 | default x 2 | Occasionally: Just a few. | tmp only | -| interactive | default x 3 | default | Minimal: max 1 job per user. | tmp only | -| ds | default | default | Minimal: max 1 core + 1GB mem per job. | tmp and prm | +| QoS | Priority | Usage Factor | Available Resources | Shared Storage | Preemptable Jobs | +|:----------- |:----------- |:------------- |:--------------------------------------------- |:-------------- |:---------------- | +| leftover | 0 | none | Lots, up to the whole cluster for short jobs. | tmp only | Yes | +| regular | default | default | Quite some, but never the whole cluster. | tmp only | Only short jobs | +| priority | default x 2 | default x 2 | Just a few, max ~ 25 percent of the cluster. | tmp only | No | +| panic mode | default x 2 | default x 2 | Occasionally: Just a few. | tmp only | No | +| interactive | default x 3 | default | Minimal: max 1 job per user. | tmp only | No | +| ds | default | default | Minimal: max 1 core + 1GB mem per job. | tmp and prm | No | Recent jobs determine your _fair share_ weight when calculating job priority: From 4be62e4e5cf68ca526d03a670f63f1f7a544b5bd Mon Sep 17 00:00:00 2001 From: pneerincx Date: Mon, 28 Jun 2021 16:49:34 +0200 Subject: [PATCH 08/10] Fixed formatting in docs. --- roles/online_docs/templates/mkdocs/docs/analysis.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/roles/online_docs/templates/mkdocs/docs/analysis.md b/roles/online_docs/templates/mkdocs/docs/analysis.md index d8e391c23..464519f66 100644 --- a/roles/online_docs/templates/mkdocs/docs/analysis.md +++ b/roles/online_docs/templates/mkdocs/docs/analysis.md @@ -388,9 +388,9 @@ A dedicated QoS level for interactive jobs. These jobs will get super mega hyper You can have only one job in QoS _interactive_ otherwise it would not be interactive anymore. There is no _medium_ nor _long_ QoS sub-level for interactive jobs: if you need more than 6 hours it is either no longer interactive work or it is not healthy and you need to get yourself a break! -Jobs in QoS _interactive-short_ cannot get preempted themselves and can preempt jobs in QoS _regular-short_ & _leftover-*_. +Jobs in QoS _interactive-short_ cannot get preempted themselves and can preempt jobs in QoS _regular-short_ & _leftover_. 
Interactive jobs will have a bash ```${TMOUT}``` environment variable set to 30 minutes, so you can get a quick coffee break, -but do not try to keep in-active *interactive* sessions alive by running silly programs that waste CPU cycles: +but do not try to keep in-active _interactive_ sessions alive by running silly programs that waste CPU cycles: Logout if you go to a meeting and start a new interactive job when you get back instead. Wasting more than 30 minutes worth of resources in this QoS may lead to a temporary ban. From 7ac03323a46b660f9e574d9e3cb9951117e116b0 Mon Sep 17 00:00:00 2001 From: pneerincx Date: Mon, 28 Jun 2021 16:52:43 +0200 Subject: [PATCH 09/10] Fixed formatting in docs. --- roles/online_docs/templates/mkdocs/docs/analysis.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/roles/online_docs/templates/mkdocs/docs/analysis.md b/roles/online_docs/templates/mkdocs/docs/analysis.md index 464519f66..42870ed5e 100644 --- a/roles/online_docs/templates/mkdocs/docs/analysis.md +++ b/roles/online_docs/templates/mkdocs/docs/analysis.md @@ -287,7 +287,7 @@ but for long running jobs it is less suitable, because the longer the walltime, ## Quality of Service -We use 5 Quality of Service (QoS) levels with 3 QoS sub-levels each. +We use 6 Quality of Service (QoS) levels with 3 QoS sub-levels each. The QoS sub-levels are automagically determined by the system to differentiate between short versus medium versus long running jobs and enforce limits on resources available to the latter to prevent long running jobs from hogging the complete cluster. The base QoS levels are determined by the users and these allow you to differentiate between: From d2c95bd962e325e0798388b9830c5371fd8bed50 Mon Sep 17 00:00:00 2001 From: pneerincx Date: Mon, 28 Jun 2021 18:45:53 +0200 Subject: [PATCH 10/10] Removed preempt from main/routing QoS "priority". --- .../templates/configure_slurm_accounting_db.bash | 1 - 1 file changed, 1 deletion(-) diff --git a/roles/slurm_management/templates/configure_slurm_accounting_db.bash b/roles/slurm_management/templates/configure_slurm_accounting_db.bash index b0de5bd29..a3fa1caa6 100644 --- a/roles/slurm_management/templates/configure_slurm_accounting_db.bash +++ b/roles/slurm_management/templates/configure_slurm_accounting_db.bash @@ -95,7 +95,6 @@ sacctmgr -i modify qos Name='regular-long' set \ sacctmgr -i create qos set Name='priority' sacctmgr -i modify qos Name='priority' set \ Description='High priority Quality of Service level with corresponding higher impact on your Fair Share.' \ - Preempt='leftover-short,leftover-medium,leftover-long' \ Priority=20 \ UsageFactor=2 \ GrpSubmit=5000 MaxSubmitJobsPU=1000 \