diff --git a/tuner/.gitignore b/tuner/.gitignore
index 10842e542..94d12c89f 100644
--- a/tuner/.gitignore
+++ b/tuner/.gitignore
@@ -2,4 +2,3 @@
 
 # Tuning artifacts
 tuning_*/
-
diff --git a/tuner/examples/dispatch/.gitignore b/tuner/examples/dispatch/.gitignore
new file mode 100644
index 000000000..9fb2fe16a
--- /dev/null
+++ b/tuner/examples/dispatch/.gitignore
@@ -0,0 +1,3 @@
+# Test files/dirs recommended by README.md.
+dump/
+benchmark.mlir
diff --git a/tuner/examples/dispatch/README.md b/tuner/examples/dispatch/README.md
new file mode 100644
index 000000000..932745f75
--- /dev/null
+++ b/tuner/examples/dispatch/README.md
@@ -0,0 +1,30 @@
+# Dispatch Tuner
+
+Allows tuning a single dispatch in isolation.
+
+## Environments
+Follow the instructions in [`/tuner/README.md`](../README.md).
+
+## Running the Dispatch Tuner
+
+### Generate a benchmark file
+Use the usual `iree-compile` command for your dispatch and add
+`--iree-hal-dump-executable-files-to=dump`. Copy the `*_benchmark.mlir` file
+to a temporary directory of your choice. This will be the input to the dispatch tuner.
+
+### Recommended Trial Run
+For an initial trial to test the tuning loop, use:
+```shell
+python -m examples.dispatch benchmark.mlir --num-candidates=20
+```
+
+### Dry Run Test
+To perform a dry run (no GPU required), use:
+```shell
+python -m examples.dispatch benchmark.mlir --num-candidates=64 --num-model-candidates=10 --dry-run
+```
+
+### Basic Usage
+```shell
+python -m examples.dispatch benchmark.mlir
+```
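A concrete sketch of the "Generate a benchmark file" step described in the README above, assuming a ROCm target and the `mmt.mlir` sample added in this PR. The `--iree-rocm-target-chip` value and the dump file name are illustrative; they depend on your GPU and on the dispatch being tuned:

```shell
# Compile the input once, dumping per-dispatch executable sources and
# benchmark files into dump/ (target-chip value is an assumption; adjust
# it to your GPU).
iree-compile mmt.mlir -o /dev/null \
  --iree-hal-target-backends=rocm \
  --iree-rocm-target-chip=gfx942 \
  --iree-hal-dump-executable-files-to=dump

# Copy the benchmark for the dispatch you want to tune (the name will vary).
cp dump/module_main_0_dispatch_0_rocm_hsaco_fb_benchmark.mlir benchmark.mlir
```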
diff --git a/tuner/examples/dispatch/__init__.py b/tuner/examples/dispatch/__init__.py
new file mode 100644
index 000000000..a85ba359d
--- /dev/null
+++ b/tuner/examples/dispatch/__init__.py
@@ -0,0 +1,5 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/tuner/examples/dispatch/__main__.py b/tuner/examples/dispatch/__main__.py
new file mode 100644
index 000000000..9fb86fd9f
--- /dev/null
+++ b/tuner/examples/dispatch/__main__.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from . import dispatch_tuner
+
+dispatch_tuner.main()
diff --git a/tuner/examples/dispatch/compile_dispatch.sh b/tuner/examples/dispatch/compile_dispatch.sh
new file mode 100755
index 000000000..0b01ac991
--- /dev/null
+++ b/tuner/examples/dispatch/compile_dispatch.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+readonly INPUT="$1"
+readonly DIR="$(dirname "$INPUT")"
+readonly BASENAME="$(basename "$INPUT" .mlir)"
+readonly OUT="${DIR}/compiled/${BASENAME}.vmfb"
+
+iree-compile "$INPUT" -o "$OUT" \
+  --compile-from=executable-sources 2>/dev/null || (mv "$INPUT" "$DIR/failed" && exit 1)
+
+iree-dump-module "$OUT" | grep -q 'rocm-hsaco-fb' || (mv "$INPUT" "$DIR/failed" && rm -f "$OUT" && exit 1)
+if [ -f "${DIR}/${BASENAME}_config.mlir" ]; then
+  cat "${DIR}/../config_prolog.mlir" "${DIR}/${BASENAME}_config.mlir" "${DIR}/../config_epilog.mlir" > "${DIR}/specs/${BASENAME}_spec.mlir"
+fi
+
+echo "Compiling ${INPUT}: success"
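A usage sketch for the script above. The candidate path is hypothetical; in normal operation libtuner invokes this script itself and lays out the `compiled/`, `failed/`, and `specs/` directories next to the candidate files beforehand:

```shell
# Compiles candidates/5.mlir to candidates/compiled/5.vmfb; on failure the
# input is moved to candidates/failed/. If candidates/5_config.mlir exists,
# a transform spec is assembled into candidates/specs/5_spec.mlir.
./compile_dispatch.sh candidates/5.mlir
```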
diff --git a/tuner/examples/dispatch/config_epilog.mlir b/tuner/examples/dispatch/config_epilog.mlir
new file mode 100644
index 000000000..c15a30502
--- /dev/null
+++ b/tuner/examples/dispatch/config_epilog.mlir
@@ -0,0 +1,12 @@
+
+//===----------------------------------------------------------------------===//
+// Entry point
+//===----------------------------------------------------------------------===//
+
+  transform.named_sequence @__kernel_config(%variant_op: !transform.any_op {transform.consumed}) {
+    transform.foreach_match in %variant_op
+        , @match_op -> @apply_op_config
+      : (!transform.any_op) -> (!transform.any_op)
+    transform.yield
+  }
+} //// module
diff --git a/tuner/examples/dispatch/config_prolog.mlir b/tuner/examples/dispatch/config_prolog.mlir
new file mode 100644
index 000000000..377ac3f8f
--- /dev/null
+++ b/tuner/examples/dispatch/config_prolog.mlir
@@ -0,0 +1,32 @@
+// Transform dialect specification for matmul on MI300 with MFMA.
+module attributes { transform.with_named_sequence } {
+//===----------------------------------------------------------------------===//
+// Matmul tuning
+//===----------------------------------------------------------------------===//
+
+  transform.named_sequence @match_mmt_f16_f16_f32(%root: !transform.any_op {transform.readonly}) -> (!transform.any_op) {
+    transform.match.operation_name %root ["linalg.generic"] : !transform.any_op
+    // transform.print %root {name = "Generic"} : !transform.any_op
+    %ins, %outs = transform.iree.match.cast_compatible_dag_from_root %root {
+    ^bb0(%lhs: tensor<?x?xf16>, %rhs: tensor<?x?xf16>, %out: tensor<?x?xf32>):
+      %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>,
+                                            affine_map<(d0, d1, d2) -> (d1, d2)>,
+                                            affine_map<(d0, d1, d2) -> (d0, d1)>],
+                           iterator_types = ["parallel", "parallel", "reduction"]}
+        ins(%lhs, %rhs : tensor<?x?xf16>, tensor<?x?xf16>) outs(%out : tensor<?x?xf32>) {
+      ^bb0(%in: f16, %in_0: f16, %acc: f32):
+        %8 = arith.extf %in : f16 to f32
+        %9 = arith.extf %in_0 : f16 to f32
+        %10 = arith.mulf %8, %9 : f32
+        %11 = arith.addf %acc, %10 : f32
+        linalg.yield %11 : f32
+      } -> tensor<?x?xf32>
+    } : (!transform.any_op) -> (!transform.any_value, !transform.any_value)
+    transform.yield %root : !transform.any_op
+  }
+
+  transform.named_sequence @apply_op_config(%op: !transform.any_op {transform.readonly}, %config: !transform.any_param {transform.readonly}) {
+    transform.annotate %op "compilation_info" = %config : !transform.any_op, !transform.any_param
+    // transform.print %op {name = "Applied"} : !transform.any_op
+    transform.yield
+  }
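The spec assembly done by `compile_dispatch.sh` is plain concatenation: the prolog opens the module and defines the generic matcher, the candidate config supplies the tuned `compilation_info`, and the epilog wires `@match_op` to `@apply_op_config` and closes the module. For a hypothetical candidate config `5_config.mlir` this amounts to:

```shell
# Sandwich the candidate's config between the shared prolog and epilog to
# produce a standalone transform-dialect spec (file names are illustrative).
cat config_prolog.mlir 5_config.mlir config_epilog.mlir > specs/5_spec.mlir
```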
diff --git a/tuner/examples/dispatch/dispatch_tuner.py b/tuner/examples/dispatch/dispatch_tuner.py
new file mode 100644
index 000000000..e5491d06d
--- /dev/null
+++ b/tuner/examples/dispatch/dispatch_tuner.py
@@ -0,0 +1,137 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+"""
+Sample Usage:
+
+python -m examples.dispatch benchmark.mlir --lhs-dims=bmk --rhs-dims=bkn --tile-dims=*mnk --devices=hip://0,hip://1 --num-candidates=64
+
+
+Recommended Trial Run:
+
+python -m examples.dispatch benchmark.mlir --num-candidates=10
+
+
+Dry Run Test (no GPU required):
+
+python -m examples.dispatch benchmark.mlir --num-candidates=64 --dry-run
+
+"""
+
+from tuner import libtuner
+from pathlib import Path, PurePath
+import os
+
+
+class DispatchTuner(libtuner.TuningClient):
+    def get_dispatch_compile_timeout_s(self) -> int:
+        return 10
+
+    def get_dispatch_compile_command(
+        self, candidate_tracker: libtuner.CandidateTracker
+    ) -> list[str]:
+        assert candidate_tracker.dispatch_mlir_path is not None
+        mlir_path: Path = candidate_tracker.dispatch_mlir_path
+        script_dir = Path(__file__).resolve().parent
+        command = [
+            (script_dir / "compile_dispatch.sh").as_posix(),
+            mlir_path.as_posix(),
+        ]
+        return command
+
+    def get_dispatch_benchmark_timeout_s(self) -> int:
+        return 15
+
+    def get_dispatch_benchmark_command(
+        self,
+        candidate_tracker: libtuner.CandidateTracker,
+    ) -> list[str]:
+        compiled_vmfb_path = candidate_tracker.compiled_dispatch_path
+        assert compiled_vmfb_path is not None
+
+        command = [
+            "iree-benchmark-module",
+            f"--device={libtuner.DEVICE_ID_PLACEHOLDER}",
+            f"--module={compiled_vmfb_path.resolve()}",
+            "--batch_size=1000",
+            "--benchmark_repetitions=3",
+        ]
+
+        return command
+
+    def get_model_compile_timeout_s(self) -> int:
+        return 0
+
+    def get_model_compile_command(
+        self, candidate_tracker: libtuner.CandidateTracker
+    ) -> list[str]:
+        return []
+
+    def get_model_benchmark_timeout_s(self) -> int:
+        return 0
+
+    def get_model_benchmark_command(
+        self, candidate_tracker: libtuner.CandidateTracker
+    ) -> list[str]:
+        return []
+
+
+def main():
+    args = libtuner.parse_arguments()
+    path_config = libtuner.PathConfig()
+    # These will not be used, so always default to the empty config in the script dir.
+    script_dir = Path(__file__).resolve().parent
+    path_config.global_config_prolog_mlir = (
+        script_dir / path_config.global_config_prolog_mlir
+    )
+    path_config.global_config_epilog_mlir = (
+        script_dir / path_config.global_config_epilog_mlir
+    )
+    path_config.base_dir.mkdir(parents=True, exist_ok=True)
+    path_config.output_unilog.touch()
+    candidate_trackers: list[libtuner.CandidateTracker] = []
+    dispatch_tuner = DispatchTuner()
+    stop_after_phase: str = args.stop_after
+
+    print("Setup logging")
+    libtuner.setup_logging(args, path_config)
+    print(path_config.run_log, end="\n\n")
+
+    if not args.dry_run:
+        print("Validating devices")
+        libtuner.validate_devices(args.devices)
+        print("Validation successful!\n")
+
+    print("Generating candidates...")
+    candidates = libtuner.generate_candidates(args, path_config, candidate_trackers)
+    print(f"Stored candidates in {path_config.candidates_dir}\n")
+    if stop_after_phase == libtuner.ExecutionPhases.generate_candidates:
+        return
+
+    print("Compiling candidates...")
+    compiled_candidates = libtuner.compile_dispatches(
+        args, path_config, candidates, candidate_trackers, dispatch_tuner
+    )
+    print(f"Compiled files are stored in {path_config.compiled_dir}\n")
+    if stop_after_phase == libtuner.ExecutionPhases.compile_dispatches:
+        return
+
+    print("Benchmarking compiled candidates...")
+    top_candidates = libtuner.benchmark_dispatches(
+        args, path_config, compiled_candidates, candidate_trackers, dispatch_tuner
+    )
+    print(f"\nStored results in {path_config.output_unilog.resolve()}\n")
+    if stop_after_phase == libtuner.ExecutionPhases.benchmark_dispatches:
+        return
+
+    libtuner.save_pickle(path_config.candidate_trackers_pkl, candidate_trackers)
+    print(f"Candidate trackers are saved in {path_config.candidate_trackers_pkl}\n")
+
+    print("Check the detailed execution logs in:")
+    print(path_config.run_log.resolve())
+
+    for candidate in candidate_trackers:
+        libtuner.logging.debug(candidate)
diff --git a/tuner/examples/dispatch/mmt.mlir b/tuner/examples/dispatch/mmt.mlir
new file mode 100644
index 000000000..b9d6c5f4c
--- /dev/null
+++ b/tuner/examples/dispatch/mmt.mlir
@@ -0,0 +1,11 @@
+!matA_0 = tensor<2048x1280xf16>
+!matB_0 = tensor<10240x1280xf16>
+!matC_0 = tensor<2048x10240xf32>
+
+func.func @main_0(%arg0: !matA_0, %arg1: !matB_0) -> !matC_0 {
+  %cst = arith.constant 0.000000e+00 : f16
+  %5 = tensor.empty() : !matC_0
+  %6 = linalg.fill ins(%cst : f16) outs(%5 : !matC_0) -> !matC_0
+  %8 = linalg.matmul_transpose_b ins(%arg0, %arg1 : !matA_0, !matB_0) outs(%6 : !matC_0) -> !matC_0
+  return %8 : !matC_0
+}
diff --git a/tuner/examples/punet/README.md b/tuner/examples/punet/README.md
index 012e86fcb..777d1c194 100644
--- a/tuner/examples/punet/README.md
+++ b/tuner/examples/punet/README.md
@@ -31,16 +31,16 @@ cp ./dump-mmt/module_main_0_dispatch_0_rocm_hsaco_fb_benchmark.mlir test-benchma
 ### Recommended Trial Run
 For an initial trial to test the tuning loop, use:
 ```shell
-python -m tuner.examples.punet test-benchmark.mlir --num-candidates=10
+python -m examples.punet test-benchmark.mlir --num-candidates=10
 ```
 
 ### Dry Run Test
 To perform a dry run (no GPU required), use:
 ```shell
-python -m tuner.examples.punet test-benchmark.mlir --num-candidates=64 --num-model-candidates=10 --dry-run
+python -m examples.punet test-benchmark.mlir --num-candidates=64 --num-model-candidates=10 --dry-run
 ```
 
 ### Basic Usage
 ```shell
-python -m tuner.examples.punet test-benchmark.mlir
+python -m examples.punet test-benchmark.mlir
 ```
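The docstring in `dispatch_tuner.py` above also covers multi-device tuning; a sketch of a run that spreads candidate benchmarking across two HIP devices, using the same flags shown there:

```shell
python -m examples.dispatch benchmark.mlir --devices=hip://0,hip://1 --num-candidates=64
```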
diff --git a/tuner/tuner/libtuner.py b/tuner/tuner/libtuner.py
index 396b535f1..30ce732bd 100644
--- a/tuner/tuner/libtuner.py
+++ b/tuner/tuner/libtuner.py
@@ -78,12 +78,12 @@ class CandidateTracker:
     calibrated_benchmark_diff: Optional[float] = None
 
 
-@dataclass(frozen=True)
+@dataclass()
 class PathConfig:
     # Preset constants
-    global_config_prolog_mlir: Path = Path("./config_prolog.mlir")
-    global_config_epilog_mlir: Path = Path("./config_epilog.mlir")
-    model_baseline_vmfb: Path = Path("./baseline.vmfb")
+    global_config_prolog_mlir: Path = Path("config_prolog.mlir")
+    global_config_epilog_mlir: Path = Path("config_epilog.mlir")
+    model_baseline_vmfb: Path = Path("baseline.vmfb")
 
     # Dynamic paths
     base_dir: Path = field(init=False)
@@ -523,7 +523,7 @@ def create_worker_context_queue(device_ids: list[int]) -> queue.Queue[tuple[int,
 def run_command(run_pack: RunPack) -> TaskResult:
     command = run_pack.command
     check = run_pack.check
-    timeout_seconds = run_pack.timeout
+    timeout_seconds = run_pack.timeout_seconds
 
     result = None
     is_timeout = False
@@ -828,7 +828,7 @@ def compile_dispatches(
         num_worker=num_worker, task_list=task_list, function=run_command_wrapper
     )
 
-    # Note: failed/incompleted candidates can also be detected by checking if subprocess.res is None
+    # Note: failed/incomplete candidates can also be detected by checking if subprocess.res is None
     compiled_files = sorted(
         path_config.compiled_dir.glob("*.vmfb"), key=numerical_sort_key
     )
@@ -860,7 +860,8 @@ def compile_dispatches(
         compiled_candidates_hash_list.append((index, hash_val))
 
     handle_error(
-        condition=(good == 0), msg="Failed to compile all candidate .mlir files"
+        condition=(good == 0),
+        msg="All candidate dispatch .mlir files failed to compile",
     )
     handle_error(
         condition=(compiling_rate < 10),