
Commit

Merge pull request #424 from pneerincx/fix/slurm_prolog_failure
Slurm configuration fixes.
erijpkema authored Jun 29, 2021
2 parents dfe8adf + 5e0ef34 commit c712f76
Showing 2 changed files with 7 additions and 26 deletions.
group_vars/talos_cluster/vars.yml (12 changes: 6 additions & 6 deletions)
@@ -35,23 +35,23 @@ additional_etc_hosts:
 vcompute_hostnames: "{{ stack_prefix }}-vcompute[01-03]"
 vcompute_sockets: 4
 vcompute_cores_per_socket: 1
-vcompute_real_memory: 7821
+vcompute_real_memory: 7820
 vcompute_max_cpus_per_node: "{{ vcompute_sockets * vcompute_cores_per_socket - 2 }}"
 vcompute_max_mem_per_node: "{{ vcompute_real_memory - vcompute_sockets * vcompute_cores_per_socket * 512 }}"
 vcompute_local_disk: 270000
 vcompute_features: 'tmp08'
 vcompute_ethernet_interfaces:
-  - 'eth0'
-  - 'eth1'
+  - 'vlan983'
+  - 'vlan985.isilon'
 ui_hostnames: "{{ slurm_cluster_name }}"
 ui_sockets: 4
 ui_cores_per_socket: 1
-ui_real_memory: 7821
+ui_real_memory: 7820
 ui_local_disk: 0
 ui_features: 'prm08,tmp08'
 ui_ethernet_interfaces:
-  - 'eth0'
-  - 'eth1'
+  - 'vlan983'
+  - 'vlan985.isilon'
 ssh_host_signer_ca_private_key: "{{ ssh_host_signer_ca_keypair_dir }}/umcg-hpc-development-ca"
 use_ldap: yes
 create_ldap: no
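Side note (not part of the commit): the derived Slurm limits in this file are computed from vcompute_real_memory via the Jinja2 expressions shown above. A minimal Bash sketch of the same arithmetic with the new value:

# Sanity-check sketch, assuming the values from group_vars/talos_cluster/vars.yml above.
vcompute_sockets=4
vcompute_cores_per_socket=1
vcompute_real_memory=7820   # lowered from 7821 in this commit

# vcompute_max_cpus_per_node: "{{ vcompute_sockets * vcompute_cores_per_socket - 2 }}"
echo $(( vcompute_sockets * vcompute_cores_per_socket - 2 ))                            # -> 2

# vcompute_max_mem_per_node: "{{ vcompute_real_memory - vcompute_sockets * vcompute_cores_per_socket * 512 }}"
echo $(( vcompute_real_memory - vcompute_sockets * vcompute_cores_per_socket * 512 ))   # -> 5772
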
roles/slurm_management/files/slurm.prolog (21 changes: 1 addition & 20 deletions)
@@ -3,9 +3,6 @@
 if [[ -z "${SLURM_JOB_ID}" ]]; then
     logger -s "FATAL: SLURM_JOB_ID is empty or unset in SLURM prolog."
     exit 1
-elif [[ -z "${SLURM_JOB_QOS}" ]]; then
-    logger -s "FATAL: SLURM_JOB_QOS is empty or unset in SLURM prolog."
-    exit 1
 #else
 #    logger -s "DEBUG: Found SLURM_JOB_ID ${SLURM_JOB_ID} and SLURM_JOB_QOS ${SLURM_JOB_QOS} in SLURM prolog."
 fi
@@ -18,23 +15,7 @@ LOCAL_SCRATCH_DIR='/local'
 # Check if local scratch dir is mountpoint and hence not a dir on the system disk.
 #
 if [[ $(stat -c '%d' "${LOCAL_SCRATCH_DIR}") -eq $(stat -c '%d' "${LOCAL_SCRATCH_DIR}/..") ]]; then
-    if [[ "${SLURM_JOB_QOS}" =~ ^ds.* ]]; then
-        #
-        # For the data staging QoS "ds", which executes jobs only on the UI,
-        # a dedicated tmp dir per job may be absent as not all UIs have a /local mount.
-        #
-        logger -s "WARN: local scratch disk (${LOCAL_SCRATCH_DIR}) is not mounted."
-    else
-        #
-        # Make sure we can create tmp dirs in /local on compute nodes.
-        # When this fails the job must not continue as SLURM will default to /tmp,
-        # which is not suitable for heavy random IO nor large data sets.
-        # Hammering /tmp may effectively result in the node going down.
-        # When the prolog fails the node will be set to state=DRAIN instead.
-        #
-        logger -s "FATAL: local scratch disk (${LOCAL_SCRATCH_DIR}) is not mounted."
-        exit 1
-    fi
+    logger -s "WARN: local scratch disk (${LOCAL_SCRATCH_DIR}) for Slurm jobs is not mounted/available."
 else
     #
     # Create dedicated tmp dir for this job.
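Net effect of this hunk: when ${LOCAL_SCRATCH_DIR} is not a separate mount, the prolog now only logs a warning instead of exiting non-zero, which previously caused Slurm to drain the node. The mount-point test itself can be reproduced stand-alone; a minimal sketch using the same stat-based device comparison (not from the repository):

#!/bin/bash
# Sketch: a directory is a mount point when its device number differs from its parent's.
LOCAL_SCRATCH_DIR='/local'
if [[ $(stat -c '%d' "${LOCAL_SCRATCH_DIR}") -eq $(stat -c '%d' "${LOCAL_SCRATCH_DIR}/..") ]]; then
    # Same device number as the parent directory, so not a separate mount.
    logger -s "WARN: local scratch disk (${LOCAL_SCRATCH_DIR}) for Slurm jobs is not mounted/available."
else
    echo "${LOCAL_SCRATCH_DIR} is a separate mount point."
fi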
