From 383e2bfa11070461b4be37371ab7d59f43743e70 Mon Sep 17 00:00:00 2001
From: R-Fehler <36566250+R-Fehler@users.noreply.github.com>
Date: Wed, 3 Apr 2024 16:30:54 +0200
Subject: [PATCH 1/2] handle missing scontrol dependency for SLURM

---
Review notes (ignored by git am):
- _slurm_extract_first_node now keeps digits/hyphens that are part of the
  host name ("node7", "gpu-a[01-03]", "node1,node2") instead of returning
  only the leading letters.
- "re" is imported inside the helper because this series does not touch the
  module's import block.
- The commit ids and "index" blob hashes are carried over from the original
  series and no longer match the edited content.

 mmengine/dist/utils.py | 67 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 65 insertions(+), 2 deletions(-)

diff --git a/mmengine/dist/utils.py b/mmengine/dist/utils.py
index 3c136973bb..76efb3c9c0 100644
--- a/mmengine/dist/utils.py
+++ b/mmengine/dist/utils.py
@@ -171,6 +171,66 @@ def _init_dist_mpi(backend, **kwargs) -> None:
     os.environ['RANK'] = os.environ['OMPI_COMM_WORLD_RANK']
     torch_dist.init_process_group(backend=backend, **kwargs)
 
+
+def _slurm_extract_first_node(slurm_nodelist):
+    """Return the first hostname encoded in a SLURM node list.
+
+    Pure-Python stand-in for
+    ``scontrol show hostname {node_list} | head -n1`` for environments
+    (e.g. containers) where the ``scontrol`` executable is unavailable.
+
+    Args:
+        slurm_nodelist (str): Value of ``SLURM_NODELIST``, e.g. ``node7``,
+            ``node[01-03,07]`` or ``gpu-a[1-2],gpu-b3``.
+
+    Returns:
+        str: First hostname of the list, e.g. ``node01``.
+
+    Raises:
+        ValueError: If ``slurm_nodelist`` is empty or malformed.
+    """
+    # Imported locally so the helper does not depend on a module-level
+    # ``import re``.
+    import re
+
+    nodelist = slurm_nodelist.strip() if slurm_nodelist else ''
+    # First comma-separated entry; commas inside ``[...]`` separate ranges,
+    # not hosts, so bracket groups are consumed as a unit.
+    entry = re.match(r'(?:[^,\[\]]|\[[^\[\]]*\])*', nodelist).group(0)
+    rest = nodelist[len(entry):]
+    if not entry or (rest and not rest.startswith(',')):
+        raise ValueError('Invalid SLURM_NODELIST format')
+
+    def _first_index(match):
+        # '01-03,07' -> '01'; zero padding is preserved.
+        first = match.group(1).split(',')[0].split('-')[0]
+        if not first:
+            raise ValueError('Invalid SLURM_NODELIST format')
+        return first
+
+    # Host names may contain digits and hyphens, so everything outside the
+    # brackets is kept verbatim.
+    return re.sub(r'\[([^\[\]]*)\]', _first_index, entry)
+
+
+def is_scontrol_available():
+    """Return whether the ``scontrol`` executable can be run.
+
+    Returns:
+        bool: ``True`` if ``scontrol -h`` exits successfully, ``False`` if
+        it fails or cannot be executed.
+    """
+    try:
+        subprocess.run(['scontrol', '-h'],
+                       stdout=subprocess.PIPE,
+                       stderr=subprocess.PIPE,
+                       check=True)
+    except (subprocess.CalledProcessError, OSError):
+        # OSError covers FileNotFoundError (not installed) as well as e.g.
+        # PermissionError (present but not executable).
+        return False
+    return True
+
 
 def _init_dist_slurm(backend,
                      port=None,
@@ -196,8 +256,11 @@ def _init_dist_slurm(backend,
     else:
         num_gpus = torch.cuda.device_count()
         local_rank = proc_id % num_gpus
-    addr = subprocess.getoutput(
-        f'scontrol show hostname {node_list} | head -n1')
+    if is_scontrol_available():
+        addr = subprocess.getoutput(
+            f'scontrol show hostname {node_list} | head -n1')
+    else:
+        addr = _slurm_extract_first_node(node_list)
     # specify master port
     if port is not None:
         os.environ['MASTER_PORT'] = str(port)

From cafc4aafd71ebc6d0f89f636787216344b058082 Mon Sep 17 00:00:00 2001
From: R-Fehler <36566250+R-Fehler@users.noreply.github.com>
Date: Wed, 3 Apr 2024 16:43:30 +0200
Subject: [PATCH 2/2] private function naming

---
 mmengine/dist/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mmengine/dist/utils.py b/mmengine/dist/utils.py
index 76efb3c9c0..b474470ea9 100644
--- a/mmengine/dist/utils.py
+++ b/mmengine/dist/utils.py
@@ -213,7 +213,7 @@ def _slurm_extract_first_node(slurm_nodelist):
     return re.sub(r'\[([^\[\]]*)\]', _first_index, entry)
 
 
-def is_scontrol_available():
+def _is_scontrol_available():
     """Return whether the ``scontrol`` executable can be run.
 
     Returns:
@@ -256,7 +256,7 @@ def _init_dist_slurm(backend,
     else:
         num_gpus = torch.cuda.device_count()
         local_rank = proc_id % num_gpus
-    if is_scontrol_available():
+    if _is_scontrol_available():
         addr = subprocess.getoutput(
             f'scontrol show hostname {node_list} | head -n1')
     else: