Skip to content

Commit

Permalink
[tuner] Update gpu pipeline option handling
Browse files Browse the repository at this point in the history
Use the attribute format introduced in
iree-org/iree#18458.

Fixes: #186
  • Loading branch information
kuhar committed Oct 16, 2024
1 parent 6b90ac7 commit e7cce28
Show file tree
Hide file tree
Showing 2 changed files with 131 additions and 21 deletions.
60 changes: 53 additions & 7 deletions tuner/tuner/candidate_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,10 @@
import pickle
import re
import z3
from dataclasses import asdict, dataclass
from dataclasses import astuple, dataclass
from enum import Enum
from os import mkdir, path, makedirs
from typing import Callable, Optional
from typing import Optional
from textwrap import indent
from abc import ABC, abstractmethod

Expand Down Expand Up @@ -148,6 +148,44 @@ def all():
]


class ReorderWorkgroupsStrategy(Enum):
NONE = 0
SWIZZLE = 1
TRANSPOSE = 2

def __str__(self) -> str:
return self.name.title()


@dataclass
class GpuPipelineOptions:
"""Represents the `iree_gpu.pipeline_options` attribute"""

prefetch_shared_memory: Optional[bool] = None
no_reduce_shared_memory_bank_conflicts: Optional[bool] = None
reorder_workgroups_strategy: Optional[ReorderWorkgroupsStrategy] = None

def all_default(self) -> bool:
return all(x is None for x in astuple(self))

def __str__(self) -> str:
options: list[str] = []
if self.prefetch_shared_memory is not None:
options.append(
f"prefetch_shared_memory = {str(self.prefetch_shared_memory).lower()}"
)
if self.no_reduce_shared_memory_bank_conflicts is not None:
options.append(
f"no_reduce_shared_memory_bank_conflicts = {str(self.no_reduce_shared_memory_bank_conflicts).lower()}"
)
if self.reorder_workgroups_strategy is not None:
options.append(
f"reorder_workgroups_strategy = {self.reorder_workgroups_strategy}"
)

return f"#iree_gpu.pipeline_options<{', '.join(options)}>"


@dataclass
class Configuration:
subgroup_size: int
Expand All @@ -156,6 +194,7 @@ class Configuration:
tile_sizes: list[int]
subgroup_m_count: int
subgroup_n_count: int
gpu_pipeline_options: GpuPipelineOptions
waves_per_eu: int


Expand Down Expand Up @@ -223,7 +262,9 @@ def get_batch_mmt_tile_sizes(configuration: Configuration) -> list[int]:


def get_pipeline_config(configuration: Configuration) -> str:
extra_config = ", prefetch_shared_memory"
extra_config = ""
if not configuration.gpu_pipeline_options.all_default():
extra_config += f", gpu_pipeline_options = {configuration.gpu_pipeline_options}"
if configuration.waves_per_eu != 2:
extra_config += f', llvm_func_attrs = {{"amdgpu-waves-per-eu" = "{configuration.waves_per_eu}"}}'
return extra_config
Expand All @@ -234,17 +275,19 @@ def apply_configuration(
) -> str:
tune_logger.info(f"Applying: {configuration}")
expr0 = re.compile(
r"<intrinsic = #iree_gpu.mma_layout<(.+)>, subgroup_m_count = ([0-9]+), subgroup_n_count = ([0-9]+)>"
r"<intrinsic = #iree_gpu\.mma_layout<(.+)>, subgroup_m_count = ([0-9]+), subgroup_n_count = ([0-9]+)>"
)
expr1 = re.compile(
r"LLVMGPUVectorDistribute workgroup_size = \[.+\] subgroup_size = ([0-9]+),"
)
expr2 = re.compile(r"tile_sizes = \[\[([0-9]+)(, ([0-9]+))+\]\]")
expr3 = re.compile(r"\"amdgpu-waves-per-eu\" = \"([0-9])\"")
expr3 = re.compile(r"gpu_pipeline_options = #iree_gpu\.pipeline_options<([^>]*)>")
expr4 = re.compile(r"\"amdgpu-waves-per-eu\" = \"([0-9])\"")
repl0 = f"<intrinsic = #iree_gpu.mma_layout<{configuration.intrinsic}>, subgroup_m_count = {configuration.subgroup_m_count}, subgroup_n_count = {configuration.subgroup_n_count}>"
repl1 = f'LLVMGPUVectorDistribute workgroup_size = [{", ".join(map(str, configuration.workgroup_size))}] subgroup_size = {configuration.subgroup_size},'
repl2 = f'tile_sizes = [[{", ".join(map(str, tile_sizes))}]]'
repl3 = f'"amdgpu-waves-per-eu" = "{configuration.waves_per_eu}"'
repl3 = f"gpu_pipeline_options = {configuration.gpu_pipeline_options}"
repl4 = f'"amdgpu-waves-per-eu" = "{configuration.waves_per_eu}"'

new_mlir = ""
for line in template:
Expand All @@ -254,8 +297,10 @@ def apply_configuration(
line = re.sub(expr1, repl1, line)
if "tile_sizes" in line:
line = re.sub(expr2, repl2, line)
if "amdgpu-waves-per-eu" in line:
if "gpu_pipeline_options =" in line:
line = re.sub(expr3, repl3, line)
if "amdgpu-waves-per-eu" in line:
line = re.sub(expr4, repl4, line)
new_mlir += line

return new_mlir
Expand Down Expand Up @@ -461,6 +506,7 @@ def generate_solutions(problem_size: ProblemSize, num_subgrups: int):
[lookup(m), lookup(n), lookup(k)],
lookup(sg_m_cnt),
lookup(sg_n_cnt),
GpuPipelineOptions(),
lookup(waves_per_eu),
)
solver.add(z3.simplify(z3.Not(z3.And(list(x == model[x] for x in all_vars)))))
Expand Down
92 changes: 78 additions & 14 deletions tuner/tuner/candidate_gen_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def test_get_mmt_tile_sizes():
tile_sizes=[128, 320, 32],
subgroup_m_count=0,
subgroup_n_count=0,
gpu_pipeline_options=candidate_gen.GpuPipelineOptions(),
waves_per_eu=0,
)
assert candidate_gen.get_mmt_tile_sizes(config) == [128, 320, 32]
Expand All @@ -80,6 +81,7 @@ def test_get_conv_tile_sizes():
tile_sizes=[464, 320, 16],
subgroup_m_count=1,
subgroup_n_count=4,
gpu_pipeline_options=candidate_gen.GpuPipelineOptions(),
waves_per_eu=1,
)
assert candidate_gen.ConvTuner().get_conv_tile_sizes(config) == [
Expand All @@ -93,6 +95,32 @@ def test_get_conv_tile_sizes():
]


def test_gpu_pipeline_options():
options = candidate_gen.GpuPipelineOptions()
assert options.all_default()
assert str(options) == "#iree_gpu.pipeline_options<>"

options.prefetch_shared_memory = True
assert not options.all_default()
assert str(options) == "#iree_gpu.pipeline_options<prefetch_shared_memory = true>"

options.no_reduce_shared_memory_bank_conflicts = False
assert (
str(options)
== "#iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>"
)

options = candidate_gen.GpuPipelineOptions()
options.reorder_workgroups_strategy = (
candidate_gen.ReorderWorkgroupsStrategy.TRANSPOSE
)
assert not options.all_default()
assert (
str(options)
== "#iree_gpu.pipeline_options<reorder_workgroups_strategy = Transpose>"
)


def test_get_contract_tile_sizes():
config = candidate_gen.Configuration(
subgroup_size=32,
Expand All @@ -101,6 +129,7 @@ def test_get_contract_tile_sizes():
tile_sizes=[4, 8, 16],
subgroup_m_count=1,
subgroup_n_count=1,
gpu_pipeline_options=candidate_gen.GpuPipelineOptions(),
waves_per_eu=2,
)
assert candidate_gen.get_contract_tile_sizes(config, ["m", "n", "k"]) == [4, 8, 16]
Expand All @@ -114,28 +143,28 @@ def test_get_contract_tile_sizes():


def test_get_pipeline_config():
config1 = candidate_gen.Configuration(
config = candidate_gen.Configuration(
subgroup_size=32,
workgroup_size=[16, 16, 1],
intrinsic="",
tile_sizes=[4, 8, 16],
subgroup_m_count=1,
subgroup_n_count=1,
gpu_pipeline_options=candidate_gen.GpuPipelineOptions(),
waves_per_eu=2,
)
config2 = candidate_gen.Configuration(
subgroup_size=32,
workgroup_size=[16, 16, 1],
intrinsic="",
tile_sizes=[4, 8, 16],
subgroup_m_count=1,
subgroup_n_count=1,
waves_per_eu=4,
)
assert candidate_gen.get_pipeline_config(config1) == ", prefetch_shared_memory"
config1_str: str = candidate_gen.get_pipeline_config(config)
assert config1_str == ""

config.waves_per_eu = 4
config2_str: str = candidate_gen.get_pipeline_config(config)
assert config2_str == ', llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}'

config.gpu_pipeline_options.prefetch_shared_memory = True
config3_str = candidate_gen.get_pipeline_config(config)
assert (
candidate_gen.get_pipeline_config(config2)
== ', prefetch_shared_memory, llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}'
config3_str
== ', gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>, llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}'
)


Expand Down Expand Up @@ -409,11 +438,18 @@ def test_generate_constraints_invalid_input():
assert solver.check() == candidate_gen.z3.unsat


def remove_comments(mlir: str) -> str:
return "\n".join(
filter(lambda x: not x.lstrip().startswith("//"), mlir.splitlines())
)


def test_apply_params_mmt():
mlir_template = [
"<intrinsic = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16>, subgroup_m_count = 16, subgroup_n_count = 16>",
"<LLVMGPUVectorDistribute workgroup_size = [16, 16] subgroup_size = 16,",
"<tile_sizes = [[8, 8, 8]]>",
"gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = None>",
'{llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}',
]

Expand All @@ -426,6 +462,9 @@ def test_apply_params_mmt():
tile_sizes=[8, 8, 8],
subgroup_m_count=16,
subgroup_n_count=16,
gpu_pipeline_options=candidate_gen.GpuPipelineOptions(
prefetch_shared_memory=True
),
waves_per_eu=8,
)

Expand All @@ -442,6 +481,7 @@ def test_apply_params_mmt():
embeddable = tf_mlir.embeddable

assert modified
modified = remove_comments(modified)
assert embeddable
assert (
"intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, subgroup_m_count = 16, subgroup_n_count = 16"
Expand All @@ -452,6 +492,10 @@ def test_apply_params_mmt():
in modified
)
assert "tile_sizes = [[8, 8, 8]]" in modified
assert (
"gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>"
in modified
)
assert '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "8"}' in modified


Expand All @@ -460,7 +504,7 @@ def test_apply_params_conv():
"<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, subgroup_m_count = 16, subgroup_n_count = 16>",
"<LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64,",
"<tile_sizes = [[1, 1, 64, 128, 1, 1, 32]]>",
'{llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}',
'gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>, {llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}',
]

n, oh, ow, oc, fh, fw, ic = 2, 64, 64, 640, 3, 3, 640
Expand All @@ -472,6 +516,9 @@ def test_apply_params_conv():
tile_sizes=[464, 320, 16],
subgroup_m_count=1,
subgroup_n_count=4,
gpu_pipeline_options=candidate_gen.GpuPipelineOptions(
reorder_workgroups_strategy=candidate_gen.ReorderWorkgroupsStrategy.TRANSPOSE
),
waves_per_eu=2,
)

Expand All @@ -492,6 +539,8 @@ def test_apply_params_conv():
embeddable = tf_mlir.embeddable

assert modified
modified = remove_comments(modified)

assert embeddable
assert (
"intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, subgroup_m_count = 1, subgroup_n_count = 4"
Expand All @@ -502,6 +551,10 @@ def test_apply_params_conv():
in modified
)
assert "tile_sizes = [[1, 1, 464, 320, 1, 1, 16]]" in modified
assert (
"gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = Transpose>"
in modified
)
assert '{llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}' in modified


Expand Down Expand Up @@ -529,6 +582,7 @@ def test_apply_params_contract():
tile_sizes=[480, 384, 32],
subgroup_m_count=1,
subgroup_n_count=4,
gpu_pipeline_options=candidate_gen.GpuPipelineOptions(),
waves_per_eu=2,
)

Expand Down Expand Up @@ -575,6 +629,7 @@ def test_apply_params_batch_matmul():
tile_sizes=[416, 320, 128],
subgroup_m_count=2,
subgroup_n_count=2,
gpu_pipeline_options=candidate_gen.GpuPipelineOptions(),
waves_per_eu=2,
)

Expand All @@ -586,6 +641,8 @@ def test_apply_params_batch_matmul():
embeddable = tf_mlir.embeddable

assert modified
modified = remove_comments(modified)

assert embeddable
assert (
"intrinsic = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16>, subgroup_m_count = 2, subgroup_n_count = 2"
Expand Down Expand Up @@ -622,6 +679,7 @@ def test_apply_params_batch_mmt_float():
tile_sizes=[128, 64, 128],
subgroup_m_count=2,
subgroup_n_count=2,
gpu_pipeline_options=candidate_gen.GpuPipelineOptions(),
waves_per_eu=2,
)

Expand Down Expand Up @@ -669,6 +727,7 @@ def test_apply_params_batch_mmt_int():
tile_sizes=[128, 64, 128],
subgroup_m_count=2,
subgroup_n_count=2,
gpu_pipeline_options=candidate_gen.GpuPipelineOptions(),
waves_per_eu=4,
)

Expand All @@ -681,6 +740,8 @@ def test_apply_params_batch_mmt_int():

assert modified
assert "// transform.named_sequence @match_batch_mmt_2x4096x640x640(" in modified
modified = remove_comments(modified)

assert (
"intrinsic = #iree_gpu.mma_layout<MFMA_I32_32x32x16_I8>, subgroup_m_count = 2, subgroup_n_count = 2"
in modified
Expand Down Expand Up @@ -737,6 +798,7 @@ def test_apply_params_broadcast_rhs_mmt():
tile_sizes=[128, 64, 128],
subgroup_m_count=2,
subgroup_n_count=2,
gpu_pipeline_options=candidate_gen.GpuPipelineOptions(),
waves_per_eu=4,
)

Expand All @@ -752,6 +814,8 @@ def test_apply_params_broadcast_rhs_mmt():
"// transform.named_sequence @match_broadcast_rhs_mmt_Bx4096x640x640("
in modified
)
modified = remove_comments(modified)

assert (
"intrinsic = #iree_gpu.mma_layout<MFMA_I32_32x32x16_I8>, subgroup_m_count = 2, subgroup_n_count = 2"
in modified
Expand Down

0 comments on commit e7cce28

Please sign in to comment.