Skip to content

Commit

Permalink
[tuner] Add dispatch tuner example
Browse files Browse the repository at this point in the history
  • Loading branch information
kuhar committed Oct 4, 2024
1 parent 10d9b58 commit f5f7b4d
Show file tree
Hide file tree
Showing 12 changed files with 268 additions and 11 deletions.
1 change: 0 additions & 1 deletion tuner/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,3 @@

# Tuning artifacts
tuning_*/

3 changes: 3 additions & 0 deletions tuner/examples/dispatch/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Test files/dirs recommended by README.md.
dump/
benchmark.mlir
30 changes: 30 additions & 0 deletions tuner/examples/dispatch/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Dispatch Tuner

Allows tuning a single dispatch in isolation.

## Environments
Follow instructions in [`/tuner/README.md`](../README.md)

## Running the Dispatch Tuner

### Generate a benchmark file
Use the usual `iree-compile` command for your dispatch and add
`--iree-hal-dump-executable-files-to=dump`. Copy the `*_benchmark.mlir` file
to some temporary directory of choice. This will be the input to the dispatch tuner.

### Recommended Trial Run
For an initial trial to test the tuning loop, use:
```shell
python -m examples.dispatch benchmark.mlir --num-candidates=20
```

### Dry Run Test
To perform a dry run (no GPU required), use:
```shell
python -m examples.dispatch benchmark.mlir --num-candidates=64 --num-model-candidates=10 --dry-run
```

### Basic Usage
```shell
python -m examples.dispatch benchmark.mlir
```
5 changes: 5 additions & 0 deletions tuner/examples/dispatch/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Copyright 2024 Advanced Micro Devices, Inc.
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
9 changes: 9 additions & 0 deletions tuner/examples/dispatch/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Copyright 2024 Advanced Micro Devices, Inc.
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

# Package entry point: `python -m examples.dispatch ...` runs the dispatch
# tuner's main loop. All argument parsing happens inside dispatch_tuner.main().
from . import dispatch_tuner

dispatch_tuner.main()
18 changes: 18 additions & 0 deletions tuner/examples/dispatch/compile_dispatch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#! /usr/bin/env bash

# Compile one candidate dispatch .mlir file and, on success, assemble its
# transform-dialect spec. Any failure moves the input into <dir>/failed so the
# tuner can tell failed candidates apart from compiled ones.
# Usage: compile_dispatch.sh <path/to/candidate.mlir>
# NOTE(review): assumes <dir>/compiled, <dir>/failed, and <dir>/specs already
# exist — presumably created by the tuner driver; confirm against libtuner.
set -eou pipefail

readonly INPUT="$1"
readonly DIR="$(dirname "$INPUT")"
readonly BASENAME="$(basename "$INPUT" .mlir)"
readonly OUT="${DIR}/compiled/${BASENAME}.vmfb"

# Compile the dispatch sources; on failure quarantine the input and exit 1.
iree-compile "$INPUT" -o "$OUT" \
  --compile-from=executable-sources 2>/dev/null || (mv "$INPUT" "$DIR/failed" && exit 1)

# Sanity-check that the module really contains a ROCm HSACO binary; otherwise
# treat it as a failed compile: quarantine the input and delete the output.
iree-dump-module "$OUT" | grep -q 'rocm-hsaco-fb' || (mv "$INPUT" "$DIR/failed" && rm -f "$OUT" && exit 1)
# If this candidate has a generated config, sandwich it between the shared
# prolog/epilog to form a complete transform spec.
if [ -f "${DIR}/${BASENAME}_config.mlir" ]; then
  cat "${DIR}/../config_prolog.mlir" "${DIR}/${BASENAME}_config.mlir" "${DIR}/../config_epilog.mlir" > "${DIR}/specs/${BASENAME}_spec.mlir"
fi

echo "Compiling ${INPUT}: success"
12 changes: 12 additions & 0 deletions tuner/examples/dispatch/config_epilog.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@

//===----------------------------------------------------------------------===//
// Entry point
//===----------------------------------------------------------------------===//

transform.named_sequence @__kernel_config(%variant_op: !transform.any_op {transform.consumed}) {
  // Apply @apply_op_config to every op matched by @match_op.
  // NOTE(review): @match_op is not defined in this file — presumably supplied
  // by the per-candidate config concatenated between prolog and epilog; verify
  // against compile_dispatch.sh's spec assembly.
  transform.foreach_match in %variant_op
      , @match_op -> @apply_op_config
    : (!transform.any_op) -> (!transform.any_op)
  transform.yield
}
} //// module
32 changes: 32 additions & 0 deletions tuner/examples/dispatch/config_prolog.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Transform dialect specification for attention on MI300 with MFMA.
module attributes { transform.with_named_sequence } {
//===----------------------------------------------------------------------===//
// Matmul tuning
//===----------------------------------------------------------------------===//

// Matches a linalg.generic computing a transposed-B matmul (C += A * B^T) with
// dynamically-shaped f16 operands extended to an f32 accumulator; yields the
// matched op so a config can be attached to it.
transform.named_sequence @match_mmt_f16_f16_f32(%root: !transform.any_op {transform.readonly}) -> (!transform.any_op) {
  transform.match.operation_name %root ["linalg.generic"] : !transform.any_op
  // transform.print %root {name = "Generic"} : !transform.any_op
  %ins, %outs = transform.iree.match.cast_compatible_dag_from_root %root {
  ^bb0(%lhs: tensor<?x?xf16>, %rhs: tensor<?x?xf16>, %out: tensor<?x?xf32>):
    %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>,
                                          affine_map<(d0, d1, d2) -> (d1, d2)>,
                                          affine_map<(d0, d1, d2) -> (d0, d1)>],
                         iterator_types = ["parallel", "parallel", "reduction"]}
        ins(%lhs, %rhs : tensor<?x?xf16>, tensor<?x?xf16>) outs(%out : tensor<?x?xf32>) {
    ^bb0(%in: f16, %in_0: f16, %acc: f32):
      %8 = arith.extf %in : f16 to f32
      %9 = arith.extf %in_0 : f16 to f32
      %10 = arith.mulf %8, %9 : f32
      %11 = arith.addf %acc, %10 : f32
      linalg.yield %11 : f32
    } -> tensor<?x?xf32>
  } : (!transform.any_op) -> (!transform.any_value, !transform.any_value)
  transform.yield %root : !transform.any_op
}

// Attaches the tuner-chosen config to the matched op via the compilation_info
// attribute, which the compiler reads to pick tile sizes/pipeline.
transform.named_sequence @apply_op_config(%op: !transform.any_op {transform.readonly}, %config: !transform.any_param {transform.readonly}) {
  transform.annotate %op "compilation_info" = %config : !transform.any_op, !transform.any_param
  // transform.print %op {name = "Applied"} : !transform.any_op
  transform.yield
}
137 changes: 137 additions & 0 deletions tuner/examples/dispatch/dispatch_tuner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# Copyright 2024 Advanced Micro Devices, Inc
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

"""
Sample Usage:
python -m examples.gemm benchmark.mlir --lhs-dims=bmk --rhs-dims=bkn --tile-dims=*mnk --devices=hip://0,hip://1 --num-candidates=64
Recommended Trial Run:
python -m examples.gemm benchmark.mlir --num-candidates=10
Dry Run Test (no gpu required):
python -m examples.gemm benchmark.mlir --num-candidates=64 --dry-run
"""

from tuner import libtuner
from pathlib import Path, PurePath
import os


class DispatchTuner(libtuner.TuningClient):
    """Tuning client that compiles and benchmarks isolated dispatches.

    Only the dispatch-phase hooks do real work; the model-phase hooks are
    no-ops because this example tunes a single dispatch, not a whole model.
    """

    def get_dispatch_compile_timeout_s(self) -> int:
        # A single dispatch should compile quickly; 10s is ample headroom.
        return 10

    def get_dispatch_compile_command(
        self, candidate_tracker: libtuner.CandidateTracker
    ) -> list[str]:
        """Return the argv that compiles one candidate dispatch .mlir file."""
        dispatch_mlir = candidate_tracker.dispatch_mlir_path
        assert dispatch_mlir is not None
        # The helper script lives next to this module.
        compile_script = Path(__file__).resolve().parent / "compile_dispatch.sh"
        return [compile_script.as_posix(), dispatch_mlir.as_posix()]

    def get_dispatch_benchmark_timeout_s(self) -> int:
        return 15

    def get_dispatch_benchmark_command(
        self,
        candidate_tracker: libtuner.CandidateTracker,
    ) -> list[str]:
        """Return the argv that benchmarks one compiled dispatch .vmfb."""
        vmfb = candidate_tracker.compiled_dispatch_path
        assert vmfb is not None
        # The device placeholder is substituted per-worker by libtuner.
        return [
            "iree-benchmark-module",
            f"--device={libtuner.DEVICE_ID_PLACEHOLDER}",
            f"--module={vmfb.resolve()}",
            "--batch_size=1000",
            "--benchmark_repetitions=3",
        ]

    # No model phase in this example: zero timeouts and empty commands below.

    def get_model_compile_timeout_s(self) -> int:
        return 0

    def get_model_compile_command(
        self, candidate_tracker: libtuner.CandidateTracker
    ) -> list[str]:
        return []

    def get_model_benchmark_timeout_s(self) -> int:
        return 0

    def get_model_benchmark_command(
        self, candidate_tracker: libtuner.CandidateTracker
    ) -> list[str]:
        return []


def main():
    """Drive the dispatch tuning loop: generate, compile, benchmark candidates."""
    cli_args = libtuner.parse_arguments()

    paths = libtuner.PathConfig()
    # These will not be used, so always default to the empty config in the script dir.
    here = Path(__file__).resolve().parent
    paths.global_config_prolog_mlir = here / paths.global_config_prolog_mlir
    paths.global_config_epilog_mlir = here / paths.global_config_epilog_mlir
    paths.base_dir.mkdir(parents=True, exist_ok=True)
    paths.output_unilog.touch()

    trackers: list[libtuner.CandidateTracker] = []
    tuner_client = DispatchTuner()
    stop_after: str = cli_args.stop_after

    print("Setup logging")
    libtuner.setup_logging(cli_args, paths)
    print(paths.run_log, end="\n\n")

    # Device validation needs real hardware; skip it for dry runs.
    if not cli_args.dry_run:
        print("Validating devices")
        libtuner.validate_devices(cli_args.devices)
        print("Validation successful!\n")

    print("Generating candidates...")
    candidates = libtuner.generate_candidates(cli_args, paths, trackers)
    print(f"Stored candidates in {paths.candidates_dir}\n")
    if stop_after == libtuner.ExecutionPhases.generate_candidates:
        return

    print("Compiling candidates...")
    compiled = libtuner.compile_dispatches(
        cli_args, paths, candidates, trackers, tuner_client
    )
    print(f"Compiled files are stored in {paths.compiled_dir}\n")
    if stop_after == libtuner.ExecutionPhases.compile_dispatches:
        return

    print("Benchmarking compiled candidates...")
    top_candidates = libtuner.benchmark_dispatches(
        cli_args, paths, compiled, trackers, tuner_client
    )
    print(f"\nStored results in {paths.output_unilog.resolve()}\n")
    if stop_after == libtuner.ExecutionPhases.benchmark_dispatches:
        return

    # Persist trackers so results can be inspected or re-ranked offline.
    libtuner.save_pickle(paths.candidate_trackers_pkl, trackers)
    print(f"Candidate trackers are saved in {paths.candidate_trackers_pkl}\n")

    print("Check the detailed execution logs in:")
    print(paths.run_log.resolve())

    for tracker in trackers:
        libtuner.logging.debug(tracker)
11 changes: 11 additions & 0 deletions tuner/examples/dispatch/mmt.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
!matA_0 = tensor<2048x1280xf16>
!matB_0 = tensor<10240x1280xf16>
!matC_0 = tensor<2048x10240xf32>

// C = A * B^T: multiplies a 2048x1280 f16 matrix by the transpose of a
// 10240x1280 f16 matrix, accumulating into a zero-initialized 2048x10240 f32
// result.
func.func @main_0(%arg0: !matA_0, %arg1: !matB_0) -> !matC_0 {
  %cst = arith.constant 0.000000e+00 : f16
  %5 = tensor.empty() : !matC_0
  %6 = linalg.fill ins(%cst : f16) outs(%5 : !matC_0) -> !matC_0
  %8 = linalg.matmul_transpose_b ins(%arg0, %arg1 : !matA_0, !matB_0) outs(%6 : !matC_0) -> !matC_0
  return %8 : !matC_0
}
6 changes: 3 additions & 3 deletions tuner/examples/punet/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,16 +31,16 @@ cp ./dump-mmt/module_main_0_dispatch_0_rocm_hsaco_fb_benchmark.mlir test-benchma
### Recommended Trial Run
For an initial trial to test the tuning loop, use:
```shell
python -m tuner.examples.punet test-benchmark.mlir --num-candidates=10
python -m examples.punet test-benchmark.mlir --num-candidates=10
```

### Dry Run Test
To perform a dry run (no GPU required), use:
```shell
python -m tuner.examples.punet test-benchmark.mlir --num-candidates=64 --num-model-candidates=10 --dry-run
python -m examples.punet test-benchmark.mlir --num-candidates=64 --num-model-candidates=10 --dry-run
```

### Basic Usage
```shell
python -m tuner.examples.punet test-benchmark.mlir
python -m examples.punet test-benchmark.mlir
```
15 changes: 8 additions & 7 deletions tuner/tuner/libtuner.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,12 +78,12 @@ class CandidateTracker:
calibrated_benchmark_diff: Optional[float] = None


@dataclass(frozen=True)
@dataclass()
class PathConfig:
# Preset constants
global_config_prolog_mlir: Path = Path("./config_prolog.mlir")
global_config_epilog_mlir: Path = Path("./config_epilog.mlir")
model_baseline_vmfb: Path = Path("./baseline.vmfb")
global_config_prolog_mlir: Path = Path("config_prolog.mlir")
global_config_epilog_mlir: Path = Path("config_epilog.mlir")
model_baseline_vmfb: Path = Path("baseline.vmfb")

# Dynamic paths
base_dir: Path = field(init=False)
Expand Down Expand Up @@ -523,7 +523,7 @@ def create_worker_context_queue(device_ids: list[int]) -> queue.Queue[tuple[int,
def run_command(run_pack: RunPack) -> TaskResult:
command = run_pack.command
check = run_pack.check
timeout_seconds = run_pack.timeout
timeout_seconds = run_pack.timeout_seconds

result = None
is_timeout = False
Expand Down Expand Up @@ -828,7 +828,7 @@ def compile_dispatches(
num_worker=num_worker, task_list=task_list, function=run_command_wrapper
)

# Note: failed/incompleted candidates can also be detected by checking if subprocess.res is None
# Note: failed/incomplete candidates can also be detected by checking if subprocess.res is None
compiled_files = sorted(
path_config.compiled_dir.glob("*.vmfb"), key=numerical_sort_key
)
Expand Down Expand Up @@ -860,7 +860,8 @@ def compile_dispatches(
compiled_candidates_hash_list.append((index, hash_val))

handle_error(
condition=(good == 0), msg="Failed to compile all candidate .mlir files"
condition=(good == 0),
msg="All candidate dispatches .mlir files failed to compile",
)
handle_error(
condition=(compiling_rate < 10),
Expand Down

0 comments on commit f5f7b4d

Please sign in to comment.