From 9d59dfd12c80d46bb085c8fc6620ec1f7392623c Mon Sep 17 00:00:00 2001
From: pneerincx
Date: Tue, 29 Jun 2021 15:24:49 +0200
Subject: [PATCH 1/2] Fixed Slurm prolog, where $SLURM_JOB_QOS does not exist.

---
 roles/slurm_management/files/slurm.prolog | 21 +--------------------
 1 file changed, 1 insertion(+), 20 deletions(-)

diff --git a/roles/slurm_management/files/slurm.prolog b/roles/slurm_management/files/slurm.prolog
index a8223eda4..13c7ae444 100644
--- a/roles/slurm_management/files/slurm.prolog
+++ b/roles/slurm_management/files/slurm.prolog
@@ -3,9 +3,6 @@
 if [[ -z "${SLURM_JOB_ID}" ]]; then
     logger -s "FATAL: SLURM_JOB_ID is empty or unset in SLURM prolog."
     exit 1
-elif [[ -z "${SLURM_JOB_QOS}" ]]; then
-    logger -s "FATAL: SLURM_JOB_QOS is empty or unset in SLURM prolog."
-    exit 1
 #else
 #    logger -s "DEBUG: Found SLURM_JOB_ID ${SLURM_JOB_ID} and SLURM_JOB_QOS ${SLURM_JOB_QOS} in SLURM prolog."
 fi
@@ -18,23 +15,7 @@ LOCAL_SCRATCH_DIR='/local'
 # Check if local scratch dir is mountpoint and hence not a dir on the system disk.
 #
 if [[ $(stat -c '%d' "${LOCAL_SCRATCH_DIR}") -eq $(stat -c '%d' "${LOCAL_SCRATCH_DIR}/..") ]]; then
-    if [[ "${SLURM_JOB_QOS}" =~ ^ds.* ]]; then
-        #
-        # For the data staging QoS "ds", which executes jobs only on the UI,
-        # a dedicated tmp dir per job may be absent as not all UIs have a /local mount.
-        #
-        logger -s "WARN: local scratch disk (${LOCAL_SCRATCH_DIR}) is not mounted."
-    else
-        #
-        # Make sure we can create tmp dirs in /local on compute nodes.
-        # When this fails the job must not continue as SLURM will default to /tmp,
-        # which is not suitable for heavy random IO nor large data sets.
-        # Hammering /tmp may effectively result in the node going down.
-        # When the prolog fails the node will be set to state=DRAIN instead.
-        #
-        logger -s "FATAL: local scratch disk (${LOCAL_SCRATCH_DIR}) is not mounted."
-        exit 1
-    fi
+    logger -s "WARN: local scratch disk (${LOCAL_SCRATCH_DIR}) for Slurm jobs is not mounted/available."
 else
     #
     # Create dedicated tmp dir for this job.

From 5e0ef343a984890d7928ff92ef4c909dbd705364 Mon Sep 17 00:00:00 2001
From: pneerincx
Date: Tue, 29 Jun 2021 16:11:44 +0200
Subject: [PATCH 2/2] Fixed config errors for Talos that would result in NHC taking nodes offline.

---
 group_vars/talos_cluster/vars.yml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/group_vars/talos_cluster/vars.yml b/group_vars/talos_cluster/vars.yml
index 88eac1339..f68289abc 100644
--- a/group_vars/talos_cluster/vars.yml
+++ b/group_vars/talos_cluster/vars.yml
@@ -35,23 +35,23 @@ additional_etc_hosts:
 vcompute_hostnames: "{{ stack_prefix }}-vcompute[01-03]"
 vcompute_sockets: 4
 vcompute_cores_per_socket: 1
-vcompute_real_memory: 7821
+vcompute_real_memory: 7820
 vcompute_max_cpus_per_node: "{{ vcompute_sockets * vcompute_cores_per_socket - 2 }}"
 vcompute_max_mem_per_node: "{{ vcompute_real_memory - vcompute_sockets * vcompute_cores_per_socket * 512 }}"
 vcompute_local_disk: 270000
 vcompute_features: 'tmp08'
 vcompute_ethernet_interfaces:
-  - 'eth0'
-  - 'eth1'
+  - 'vlan983'
+  - 'vlan985.isilon'
 ui_hostnames: "{{ slurm_cluster_name }}"
 ui_sockets: 4
 ui_cores_per_socket: 1
-ui_real_memory: 7821
+ui_real_memory: 7820
 ui_local_disk: 0
 ui_features: 'prm08,tmp08'
 ui_ethernet_interfaces:
-  - 'eth0'
-  - 'eth1'
+  - 'vlan983'
+  - 'vlan985.isilon'
 ssh_host_signer_ca_private_key: "{{ ssh_host_signer_ca_keypair_dir }}/umcg-hpc-development-ca"
 use_ldap: yes
 create_ldap: no
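
Note on the mountpoint check that remains in slurm.prolog: the prolog decides whether
LOCAL_SCRATCH_DIR is a real mount by comparing its device ID with that of its parent
directory; equal device IDs mean the directory lives on the same filesystem as its
parent and is therefore not a mountpoint. Below is a minimal, standalone bash sketch of
that idiom; the path and log messages are illustrative only and are not part of the patch.

    #!/bin/bash
    #
    # Sketch of the device-ID comparison used in slurm.prolog:
    # a directory is a mountpoint when its device ID differs from its parent's.
    #
    dir='/local'  # example path; the prolog uses LOCAL_SCRATCH_DIR='/local'
    if [[ "$(stat -c '%d' "${dir}")" -eq "$(stat -c '%d' "${dir}/..")" ]]; then
        echo "WARN: ${dir} has the same device ID as its parent: not a mountpoint."
    else
        echo "INFO: ${dir} is a mountpoint."
    fi

Note on the memory values in PATCH 2/2: vcompute_max_mem_per_node is derived from the
formula already present in group_vars/talos_cluster/vars.yml, so with the corrected
vcompute_real_memory of 7820 it evaluates to 7820 - 4 * 1 * 512 = 5772 (Slurm memory
values are in megabytes).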