From a8803448e647261ea4c29a9937922f4c832bec27 Mon Sep 17 00:00:00 2001 From: crivella Date: Mon, 15 Jul 2024 11:15:04 +0200 Subject: [PATCH 01/11] Added ReFrame test for MetalWalls --- eessi/testsuite/tests/apps/MetalWalls.py | 103 +++++++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 eessi/testsuite/tests/apps/MetalWalls.py diff --git a/eessi/testsuite/tests/apps/MetalWalls.py b/eessi/testsuite/tests/apps/MetalWalls.py new file mode 100644 index 00000000..f3ba1b7a --- /dev/null +++ b/eessi/testsuite/tests/apps/MetalWalls.py @@ -0,0 +1,103 @@ +""" +This module tests the binary 'mw' in available modules containing substring 'MetalWalls'. +Test input files are defined in MetalWalls's repo under hackathonGPU/benchmark*, +see https://github.com/reframe-hpc/reframe/blob/develop/hpctestlib/sciapps/metalwalls/benchmarks.py + +ReFrame terminology: + +"pipeline stages": +https://reframe-hpc.readthedocs.io/en/stable/regression_test_api.html#pipeline-hooks + +"test parameter": a list of values, which will generate different test variants. +https://reframe-hpc.readthedocs.io/en/stable/regression_test_api.html#reframe.core.builtins.parameter + +"test variant": a version of a test with a specific value for each test parameter +https://reframe-hpc.readthedocs.io/en/stable/regression_test_api.html#test-variants + +"concrete test cases": all test combinations that will actually run: +- test variants +- valid system:partition+programming environment combinations +https://reframe-hpc.readthedocs.io/en/stable/tutorial_deps.html#listing-dependencies + +Tests can be filtered by name, tag, programming environment, system, partition, or maintainer, +see https://reframe-hpc.readthedocs.io/en/stable/manpage.html#test-filtering + +Hooks acting on all possible test combinations (before filtering) are called after the 'init' stage. +Hooks acting on concrete test cases (after filtering) are called after the 'setup' stage. 
+ +See also https://reframe-hpc.readthedocs.io/en/stable/pipeline.html +""" +import reframe as rfm +from hpctestlib.sciapps.metalwalls.benchmarks import MetalWallsCheck +from reframe.core.builtins import run_after +from reframe.core.parameters import TestParam as parameter + +from eessi.testsuite import hooks +from eessi.testsuite.constants import (COMPUTE_UNIT, CPU, DEVICE_TYPES, GPU, + SCALES, TAGS) +from eessi.testsuite.utils import find_modules, log + + +@rfm.simple_test +class EESSI_MetalWalls_MW(MetalWallsCheck): + """MetalWalls benchmark tests. + + `MetalWalls `__ """ + + scale = parameter(SCALES.keys()) + + valid_systems = ['*'] + valid_prog_environs = ['default'] + time_limit = '30m' + + module_name = parameter(find_modules('MetalWalls')) + # For now, MetalWalls is being build for CPU targets only + # compute_device = parameter([DEVICE_TYPES[CPU], DEVICE_TYPES[GPU]]) + compute_device = parameter([DEVICE_TYPES[CPU], ]) + + @run_after('init') + def run_after_init(self): + """Hooks to run after the init phase""" + + # Filter on which scales are supported by the partitions defined in the ReFrame configuration + hooks.filter_supported_scales(self) + + # Make sure that GPU tests run in partitions that support running on a GPU, + # and that CPU-only tests run in partitions that support running CPU-only. + # Also support setting valid_systems on the cmd line. + hooks.filter_valid_systems_by_device_type(self, required_device_type=self.compute_device) + + # Support selecting modules on the cmd line. + hooks.set_modules(self) + + # Support selecting scales on the cmd line via tags. 
+ hooks.set_tag_scale(self) + + @run_after('init') + def set_tag_ci(self): + """Set tag CI on smallest benchmark, so it can be selected on the cmd line via --tag CI""" + if self.benchmark_info[0] == 'hackathonGPU/benchmark': + self.tags.add(TAGS['CI']) + log(f'tags set to {self.tags}') + + @run_after('setup') + def run_after_setup(self): + """Hooks to run after the setup phase""" + + # Calculate default requested resources based on the scale: + # 1 task per CPU for CPU-only tests, 1 task per GPU for GPU tests. + # Also support setting the resources on the cmd line. + if self.compute_device == DEVICE_TYPES[GPU]: + hooks.assign_tasks_per_compute_unit(test=self, compute_unit=COMPUTE_UNIT[GPU]) + else: + hooks.assign_tasks_per_compute_unit(test=self, compute_unit=COMPUTE_UNIT[CPU]) + + @run_after('setup') + def set_omp_num_threads(self): + """ + Set number of OpenMP threads via OMP_NUM_THREADS. + Set default number of OpenMP threads equal to number of CPUs per task. + """ + + self.env_vars['OMP_NUM_THREADS'] = self.num_cpus_per_task + log(f'env_vars set to {self.env_vars}') \ No newline at end of file From 66262819622d01f0a9f255e17b6e99ed2f6f3365 Mon Sep 17 00:00:00 2001 From: crivella Date: Mon, 15 Jul 2024 12:34:47 +0200 Subject: [PATCH 02/11] PEP --- eessi/testsuite/tests/apps/MetalWalls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eessi/testsuite/tests/apps/MetalWalls.py b/eessi/testsuite/tests/apps/MetalWalls.py index f3ba1b7a..76fd6d3e 100644 --- a/eessi/testsuite/tests/apps/MetalWalls.py +++ b/eessi/testsuite/tests/apps/MetalWalls.py @@ -100,4 +100,4 @@ def set_omp_num_threads(self): """ self.env_vars['OMP_NUM_THREADS'] = self.num_cpus_per_task - log(f'env_vars set to {self.env_vars}') \ No newline at end of file + log(f'env_vars set to {self.env_vars}') From 06f90c96b9c7e6f01a470a4122cbb4b68117e881 Mon Sep 17 00:00:00 2001 From: crivella Date: Thu, 8 Aug 2024 17:02:58 +0200 Subject: [PATCH 03/11] Added process binding and memory 
requirements --- eessi/testsuite/tests/apps/MetalWalls.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/eessi/testsuite/tests/apps/MetalWalls.py b/eessi/testsuite/tests/apps/MetalWalls.py index 76fd6d3e..94c9fed9 100644 --- a/eessi/testsuite/tests/apps/MetalWalls.py +++ b/eessi/testsuite/tests/apps/MetalWalls.py @@ -92,6 +92,18 @@ def run_after_setup(self): else: hooks.assign_tasks_per_compute_unit(test=self, compute_unit=COMPUTE_UNIT[CPU]) + @run_after('setup') + def set_binding(self): + hooks.set_compact_process_binding(self) + + @run_after('setup') + def request_mem(self): + mem_per_task = 0.4 + if self.benchmark_info[0] == 'hackathonGPU/benchmark5': + mem_per_task = 1.2 + memory_required = self.num_tasks_per_node * mem_per_task + 2 + hooks.req_memory_per_node(test=self, app_mem_req=memory_required * 1024) + @run_after('setup') def set_omp_num_threads(self): """ From 77fa436428373ae10bc916890918153106aa7a33 Mon Sep 17 00:00:00 2001 From: crivella Date: Thu, 29 Aug 2024 13:41:39 +0200 Subject: [PATCH 04/11] Docstrings and skip test if corecnt > 256 --- eessi/testsuite/tests/apps/MetalWalls.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/eessi/testsuite/tests/apps/MetalWalls.py b/eessi/testsuite/tests/apps/MetalWalls.py index 94c9fed9..9a2af792 100644 --- a/eessi/testsuite/tests/apps/MetalWalls.py +++ b/eessi/testsuite/tests/apps/MetalWalls.py @@ -94,16 +94,27 @@ def run_after_setup(self): @run_after('setup') def set_binding(self): + """Set binding to compact to improve performance reproducibility.""" hooks.set_compact_process_binding(self) @run_after('setup') def request_mem(self): + """Request memory per node based on the benchmark.""" mem_per_task = 0.4 if self.benchmark_info[0] == 'hackathonGPU/benchmark5': mem_per_task = 1.2 memory_required = self.num_tasks_per_node * mem_per_task + 2 hooks.req_memory_per_node(test=self, app_mem_req=memory_required * 1024) + @run_after('setup') + def skip_max_corecnt(self): + """Skip tests 
if number of tasks per node exceeds maximum core count.""" + max_corecnt = 256 + self.skip_if( + self.num_tasks > max_corecnt, + f'Number of tasks per node {self.num_tasks} exceeds maximum core count {max_corecnt} for {self.bench_name}' + ) + @run_after('setup') def set_omp_num_threads(self): """ From 9aeeb9961f24eca35885078ada995b6913607453 Mon Sep 17 00:00:00 2001 From: crivella Date: Tue, 3 Sep 2024 15:57:28 +0200 Subject: [PATCH 05/11] Replaced setting OMP_NUM_THREADS with new hook --- eessi/testsuite/tests/apps/MetalWalls.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/eessi/testsuite/tests/apps/MetalWalls.py b/eessi/testsuite/tests/apps/MetalWalls.py index 9a2af792..bfeaa0e7 100644 --- a/eessi/testsuite/tests/apps/MetalWalls.py +++ b/eessi/testsuite/tests/apps/MetalWalls.py @@ -121,6 +121,4 @@ def set_omp_num_threads(self): Set number of OpenMP threads via OMP_NUM_THREADS. Set default number of OpenMP threads equal to number of CPUs per task. """ - - self.env_vars['OMP_NUM_THREADS'] = self.num_cpus_per_task - log(f'env_vars set to {self.env_vars}') + hooks.set_omp_num_threads(self) From da17ec85965830c322a4d15ec2965f6006038a54 Mon Sep 17 00:00:00 2001 From: crivella Date: Mon, 23 Sep 2024 10:40:27 +0200 Subject: [PATCH 06/11] Fixed error --- eessi/testsuite/tests/apps/MetalWalls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eessi/testsuite/tests/apps/MetalWalls.py b/eessi/testsuite/tests/apps/MetalWalls.py index bfeaa0e7..057a294a 100644 --- a/eessi/testsuite/tests/apps/MetalWalls.py +++ b/eessi/testsuite/tests/apps/MetalWalls.py @@ -112,7 +112,7 @@ def skip_max_corecnt(self): max_corecnt = 256 self.skip_if( self.num_tasks > max_corecnt, - f'Number of tasks per node {self.num_tasks} exceeds maximum core count {max_corecnt} for {self.bench_name}' + f'Number of tasks per node {self.num_tasks} exceeds maximum core count {max_corecnt} for {self.benchmark_info[0]}' ) @run_after('setup') From 
3aa9f346ca5e486358a32366935bbc3ff7fa437f Mon Sep 17 00:00:00 2001 From: crivella Date: Mon, 23 Sep 2024 10:42:54 +0200 Subject: [PATCH 07/11] linting --- eessi/testsuite/tests/apps/MetalWalls.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/eessi/testsuite/tests/apps/MetalWalls.py b/eessi/testsuite/tests/apps/MetalWalls.py index 057a294a..989cc92b 100644 --- a/eessi/testsuite/tests/apps/MetalWalls.py +++ b/eessi/testsuite/tests/apps/MetalWalls.py @@ -110,9 +110,10 @@ def request_mem(self): def skip_max_corecnt(self): """Skip tests if number of tasks per node exceeds maximum core count.""" max_corecnt = 256 + bench_name = self.benchmark_info[0] self.skip_if( self.num_tasks > max_corecnt, - f'Number of tasks per node {self.num_tasks} exceeds maximum core count {max_corecnt} for {self.benchmark_info[0]}' + f'Number of tasks per node {self.num_tasks} exceeds maximum core count {max_corecnt} for {bench_name}' ) @run_after('setup') From cf9f0706fb34d2800ea6f7e0ecb8d574f3d3636f Mon Sep 17 00:00:00 2001 From: Davide Grassano <34096612+Crivella@users.noreply.github.com> Date: Thu, 10 Oct 2024 15:45:19 +0200 Subject: [PATCH 08/11] Apply suggestions from code review Co-authored-by: Caspar van Leeuwen <33718780+casparvl@users.noreply.github.com> --- eessi/testsuite/tests/apps/MetalWalls.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eessi/testsuite/tests/apps/MetalWalls.py b/eessi/testsuite/tests/apps/MetalWalls.py index 989cc92b..35fcba10 100644 --- a/eessi/testsuite/tests/apps/MetalWalls.py +++ b/eessi/testsuite/tests/apps/MetalWalls.py @@ -109,11 +109,11 @@ def request_mem(self): @run_after('setup') def skip_max_corecnt(self): """Skip tests if number of tasks per node exceeds maximum core count.""" - max_corecnt = 256 + max_task_cnt = 256 bench_name = self.benchmark_info[0] self.skip_if( self.num_tasks > max_corecnt, - f'Number of tasks per node {self.num_tasks} exceeds maximum core count {max_corecnt} for {bench_name}' + 
f'Number of tasks {self.num_tasks} exceeds maximum task count {max_task_cnt} for {bench_name}' ) @run_after('setup') From c2c3f2a7606e989c58b827265ad4671710b54b4a Mon Sep 17 00:00:00 2001 From: crivella Date: Thu, 10 Oct 2024 15:47:12 +0200 Subject: [PATCH 09/11] Fixed var name --- eessi/testsuite/tests/apps/MetalWalls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eessi/testsuite/tests/apps/MetalWalls.py b/eessi/testsuite/tests/apps/MetalWalls.py index 35fcba10..3c29546f 100644 --- a/eessi/testsuite/tests/apps/MetalWalls.py +++ b/eessi/testsuite/tests/apps/MetalWalls.py @@ -112,7 +112,7 @@ def skip_max_corecnt(self): max_task_cnt = 256 bench_name = self.benchmark_info[0] self.skip_if( - self.num_tasks > max_corecnt, + self.num_tasks > max_task_cnt, f'Number of tasks {self.num_tasks} exceeds maximum task count {max_task_cnt} for {bench_name}' ) From 6b04d5308928fcd152efe3402ca480c1b20919ea Mon Sep 17 00:00:00 2001 From: crivella Date: Thu, 10 Oct 2024 16:40:09 +0200 Subject: [PATCH 10/11] Added increased time limit for low-core runs --- eessi/testsuite/tests/apps/MetalWalls.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/eessi/testsuite/tests/apps/MetalWalls.py b/eessi/testsuite/tests/apps/MetalWalls.py index 3c29546f..20c053eb 100644 --- a/eessi/testsuite/tests/apps/MetalWalls.py +++ b/eessi/testsuite/tests/apps/MetalWalls.py @@ -80,6 +80,14 @@ def set_tag_ci(self): self.tags.add(TAGS['CI']) log(f'tags set to {self.tags}') + @run_after('init') + def set_increased_walltime(self): + """Increase the amount of time for the largest benchmark, when running with few cores.""" + # List of benchmarks that require more time to run + large_benchmarks = ['hackathonGPU/benchmark2'] + if self.num_tasks <= 4 and self.benchmark_info[0] in large_benchmarks: + self.time_limit = '120m' + @run_after('setup') def run_after_setup(self): """Hooks to run after the setup phase""" From 68cff9a38e73f66dd06854ee20e258497650fe5a Mon Sep 17 00:00:00 2001 
From: crivella Date: Fri, 18 Oct 2024 11:36:55 +0200 Subject: [PATCH 11/11] Increased base timeout to 60m --- eessi/testsuite/tests/apps/MetalWalls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eessi/testsuite/tests/apps/MetalWalls.py b/eessi/testsuite/tests/apps/MetalWalls.py index 20c053eb..cc4e9036 100644 --- a/eessi/testsuite/tests/apps/MetalWalls.py +++ b/eessi/testsuite/tests/apps/MetalWalls.py @@ -48,7 +48,7 @@ class EESSI_MetalWalls_MW(MetalWallsCheck): valid_systems = ['*'] valid_prog_environs = ['default'] - time_limit = '30m' + time_limit = '60m' module_name = parameter(find_modules('MetalWalls')) # For now, MetalWalls is being build for CPU targets only