Commit

Merge branch 'main' into findhao/operatorbench7
FindHao committed Oct 16, 2024
2 parents 8b48eea + ea4433f commit 3fbd962
Showing 167 changed files with 28,737 additions and 65 deletions.
2 changes: 2 additions & 0 deletions scripts/userbenchmark/upload_scribe.py
@@ -27,6 +27,8 @@ def format_message(self, field_dict):
assert "time" in field_dict, "Missing required Scribe field 'time'"
message = defaultdict(dict)
for field, value in field_dict.items():
if value is None:
continue
if field in self.schema["normal"]:
message["normal"][field] = str(value)
elif field in self.schema["int"]:
2 changes: 1 addition & 1 deletion submodules/flash-attention
Submodule flash-attention updated 157 files
157 changes: 157 additions & 0 deletions torchbenchmark/operator_loader/__init__.py
@@ -0,0 +1,157 @@
import argparse
import sys
import types
from typing import Any, Generator, List, Optional

import torch
from torch._dynamo.backends.cudagraphs import cudagraphs_inner
from torch._inductor.compile_fx import compile_fx
from torch._inductor.utils import gen_gm_and_inputs
from torch._ops import OpOverload
from torch.utils._pytree import tree_map_only

from torchbenchmark.util.triton_op import (
    BenchmarkOperator,
    register_benchmark_mannually,
)

from .operator_inp_utils import aten, OperatorInputsLoader, to_channels_last

timm_loader = None
huggingface_loader = None
torchbench_loader = None


def maybe_load_operator_inputs_loader():
    global timm_loader, huggingface_loader, torchbench_loader
    if timm_loader is None:
        timm_loader = OperatorInputsLoader.get_timm_loader()
    if huggingface_loader is None:
        huggingface_loader = OperatorInputsLoader.get_huggingface_loader()
    if torchbench_loader is None:
        torchbench_loader = OperatorInputsLoader.get_torchbench_loader()


def parse_args(extra_args: Optional[List[str]] = None):
    parser = argparse.ArgumentParser(allow_abbrev=False)
    parser.add_argument(
        "--channel-list",
        action="store_true",
        help="Flag to enable channel list benchmarking.",
    )
    return parser.parse_known_args(extra_args)


def list_operators() -> List[str]:
"""In the original operator benchmark design, all operators are registered in the
operator loader. We need to collect them here.
"""
maybe_load_operator_inputs_loader()
all_ops = (
list(timm_loader.get_all_ops())
+ list(huggingface_loader.get_all_ops())
+ list(torchbench_loader.get_all_ops())
)
# remove duplicate operators
all_ops_str = list(set(str(item) for item in all_ops))
return all_ops_str
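# Note: the names returned above are the str() form of each OpOverload, e.g.
# "aten.mm.default" or "aten._softmax.default" (see the input trace files added
# below); load_opbench_by_name_from_loader matches args.op against these strings.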


def load_opbench_by_name_from_loader(args: argparse.Namespace):
    all_ops_str = list_operators()
    if args.op not in all_ops_str:
        raise ValueError(f"{args.op} is not found in the operator loader.")
    # args.op is a string; we need to evaluate it to get the actual operator overload
    op_eval = eval(args.op)
    return dynamically_create_aten_op_class(op_eval)
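# args.op is evaluated with "aten" in scope (imported above from operator_inp_utils),
# so a string such as "aten.mm.default" is expected to resolve to the corresponding
# torch.ops.aten OpOverload before the benchmark class is created.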


def create_operator_class(op_eval: OpOverload):
"""Create a new class for the operator overload."""

    def __init__(
        self, tb_args: argparse.Namespace, extra_args: Optional[List[str]] = None
    ):
        BenchmarkOperator.__init__(self, tb_args, extra_args)
        native_args, _ = parse_args(extra_args)
        self.channel_list = native_args.channel_list
        self.device = tb_args.device
        self.huggingface_loader = huggingface_loader
        self.torchbench_loader = torchbench_loader
        self.timm_loader = timm_loader
        # We enable cuda graphs by default when we get the input iter, so we don't
        # use tritonbench's cuda graphs.
        self.use_cuda_graphs = False
        self.DEFAULT_PRECISION = "fp16"
        assert self.dtype in (
            torch.float16,
            torch.float32,
        ), f"AtenOpBenchmark only supports fp16 and fp32, but got {self.dtype}"

    def get_input_iter(self) -> Generator:
        inps_gens = [self.huggingface_loader, self.torchbench_loader, self.timm_loader]
        for inp_gen in inps_gens:
            for inp in inp_gen.get_inputs_for_operator(
                self.op_eval, self.dtype, self.device
            ):
                args, kwargs = inp
                if self.channel_list:
                    args, kwargs = tree_map_only(
                        torch.Tensor, to_channels_last, (args, kwargs)
                    )
                gm, gm_args = gen_gm_and_inputs(self.op_eval, args, kwargs)
                torch.jit._builtins._register_builtin(
                    torch.ops.aten.convolution_backward.default,
                    "aten::convolution_backward",
                )
                if self.device == "cuda":
                    cudagraph_eager = cudagraphs_inner(
                        gm, gm_args, copy_outputs=False, copy_inputs=False
                    )
                    self.eager_op = cudagraph_eager
                    compiled_fn = compile_fx(gm, gm_args)
                    cudagraph_compiled = cudagraphs_inner(
                        compiled_fn, gm_args, copy_outputs=False, copy_inputs=False
                    )
                    self.inductor_op = cudagraph_compiled
                else:
                    self.eager_op = gm
                    self.inductor_op = gm

                yield gm_args

    def eager(self, input):
        return lambda: self.eager_op(input)

    def inductor(self, input):
        return lambda: self.inductor_op(input)

    class_attrs = {
        "eager": eager,
        "inductor": inductor,
        "get_input_iter": get_input_iter,
        "__init__": __init__,
    }
    new_class = type("Operator", (BenchmarkOperator,), class_attrs)
    new_class.op_eval = op_eval
    return new_class


def dynamically_create_aten_op_class(op_eval: OpOverload):
"""
To keep same with custom operators, we dynamically create aten operator classes here.
"""
maybe_load_operator_inputs_loader()
class_name = f"aten_{str(op_eval).replace('.', '_')}"
    module_name = f"torchbenchmark.operator_loader.{class_name}"
    # create a new module for each operator
    op_name_module = types.ModuleType(module_name)
    sys.modules[module_name] = op_name_module
    op_class = create_operator_class(op_eval)
    # need to set __module__ to make _find_op_name_from_module_path work
    op_class.__module__ = module_name
    op_name_module.Operator = op_class
    # Because the class is dynamically created, the decorator can't get the desired
    # module_path, so the benchmarks are registered manually here.
    register_benchmark_mannually(class_name, "eager", baseline=True)
    register_benchmark_mannually(class_name, "inductor")
    return op_class
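
A rough usage sketch of the new entry points follows; it is illustrative only, and the tb_args fields beyond op and device (whatever BenchmarkOperator.__init__ additionally expects) are assumptions.

# Illustrative sketch: drive the loader-backed aten operator benchmarks.
import argparse

from torchbenchmark.operator_loader import (
    list_operators,
    load_opbench_by_name_from_loader,
)

ops = list_operators()  # e.g. ["aten.mm.default", "aten._softmax.default", ...]

# "op" is the only tb_args field read by load_opbench_by_name_from_loader;
# "aten.mm.default" is one of the operators recorded in the trace files below.
tb_args = argparse.Namespace(op="aten.mm.default", device="cuda")
OpClass = load_opbench_by_name_from_loader(tb_args)  # registers "eager" (baseline) and "inductor"

# Instantiation would then look like OpClass(tb_args, extra_args=[]), with tb_args
# also carrying whatever additional fields BenchmarkOperator.__init__ requires.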
@@ -0,0 +1,115 @@
Operator: aten._log_softmax.default
cnt: 1, ((T([1024, 30000], f16), 1, False), {})
Operator: aten._log_softmax_backward_data.default
cnt: 1, ((T([1024, 30000], f16), T([1024, 30000], f16), 1, f16), {})
Operator: aten._softmax.default
cnt: 12, ((T([2, 64, 512, 512], f16), -1, False), {})
Operator: aten._softmax_backward_data.default
cnt: 12, ((T([2, 64, 512, 512], f16), T([2, 64, 512, 512], f16), -1, f16), {})
Operator: aten._to_copy.default
cnt: 1, ((T([2, 1, 1, 512], f32),), {'dtype': f16})
Operator: aten._unsafe_view.default
cnt: 36, ((T([2, 64, 512, 64], f16), [128, 512, 64]), {})
cnt: 12, ((T([2, 64, 64, 512], f16), [128, 64, 512]), {})
cnt: 12, ((T([128, 512, 512], f16), [2, 64, 512, 512]), {})
cnt: 12, ((T([128, 512, 64], f16), [2, 64, 512, 64]), {})
cnt: 36, ((T([2, 512, 64, 64], f16), [2, 512, 4096]), {})
cnt: 12, ((T([2, 512, 4096], f16), [1024, 4096]), {})
Operator: aten.add.Tensor
cnt: 4, ((T([2, 512, 128], f16), T([2, 512, 128], f16)), {})
cnt: 12, ((T([2, 64, 512, 512], f16), T([2, 1, 1, 512], f16)), {})
cnt: 72, ((T([2, 512, 4096], f16), T([2, 512, 4096], f16)), {})
cnt: 36, ((T([2, 512, 16384], f16), T([2, 512, 16384], f16)), {})
cnt: 12, ((T([2, 512, 16384], f16), 1.0), {})
cnt: 1, ((T([2, 512, 128], f16), 1.0), {})
cnt: 99, ((T([4096], f16), T([4096], f16)), {})
cnt: 11, ((T([4096, 16384], f16), T([4096, 16384], f16)), {})
cnt: 11, ((T([16384], f16), T([16384], f16)), {})
cnt: 11, ((T([16384, 4096], f16), T([16384, 4096], f16)), {})
cnt: 44, ((T([4096, 4096], f16), T([4096, 4096], f16)), {})
cnt: 1, ((T([30000, 128], f16), T([30000, 128], f16)), {})
Operator: aten.add_.Tensor
cnt: 1, ((T([2, 512, 128], f16), T([1, 512, 128], f16)), {})
Operator: aten.addmm.default
cnt: 1, ((T([4096], f16), T([1024, 128], f16), T([128, 4096], f16, stride=(1, 128))), {})
cnt: 48, ((T([4096], f16), T([1024, 4096], f16), T([4096, 4096], f16, stride=(1, 4096))), {})
cnt: 12, ((T([16384], f16), T([1024, 4096], f16), T([4096, 16384], f16, stride=(1, 4096))), {})
cnt: 12, ((T([4096], f16), T([1024, 16384], f16), T([16384, 4096], f16, stride=(1, 16384))), {})
cnt: 1, ((T([128], f16), T([1024, 4096], f16), T([4096, 128], f16, stride=(1, 4096))), {})
cnt: 1, ((T([30000], f16), T([1024, 128], f16), T([128, 30000], f16, stride=(1, 128))), {})
Operator: aten.bmm.default
cnt: 12, ((T([128, 512, 64], f16), T([128, 64, 512], f16)), {})
cnt: 12, ((T([128, 512, 512], f16), T([128, 512, 64], f16)), {})
cnt: 12, ((T([128, 512, 512], f16, stride=(262144, 1, 512)), T([128, 512, 64], f16)), {})
cnt: 12, ((T([128, 512, 64], f16), T([128, 64, 512], f16, stride=(32768, 1, 64))), {})
cnt: 12, ((T([128, 64, 512], f16, stride=(32768, 1, 64)), T([128, 512, 512], f16)), {})
cnt: 12, ((T([128, 512, 512], f16), T([128, 512, 64], f16, stride=(32768, 1, 512))), {})
Operator: aten.clone.default
cnt: 2, ((T([2, 512], i64),), {})
Operator: aten.copy_.default
cnt: 2, ((T([2, 512], i64), T([2, 512], i64)), {})
Operator: aten.div.Tensor
cnt: 24, ((T([2, 64, 512, 512], f16), 8.0), {})
Operator: aten.embedding.default
cnt: 1, ((T([30000, 128], f16), T([2, 512], i64), 0), {})
cnt: 1, ((T([2, 128], f16), T([2, 512], i64, stride=(0, 1))), {})
cnt: 1, ((T([512, 128], f16), T([1, 512], i64)), {})
Operator: aten.embedding_dense_backward.default
cnt: 1, ((T([1, 512, 128], f16), T([1, 512], i64), 512, -1, False), {})
cnt: 1, ((T([2, 512, 128], f16), T([2, 512], i64, stride=(0, 1)), 2, -1, False), {})
cnt: 1, ((T([2, 512, 128], f16), T([2, 512], i64), 30000, 0, False), {})
Operator: aten.mm.default
cnt: 1, ((T([1024, 30000], f16), T([30000, 128], f16)), {})
cnt: 1, ((T([30000, 1024], f16, stride=(1, 30000)), T([1024, 128], f16)), {})
cnt: 1, ((T([1024, 128], f16), T([128, 4096], f16)), {})
cnt: 1, ((T([128, 1024], f16, stride=(1, 128)), T([1024, 4096], f16)), {})
cnt: 12, ((T([1024, 4096], f16), T([4096, 16384], f16)), {})
cnt: 12, ((T([4096, 1024], f16, stride=(1, 4096)), T([1024, 16384], f16)), {})
cnt: 12, ((T([1024, 16384], f16), T([16384, 4096], f16)), {})
cnt: 12, ((T([16384, 1024], f16, stride=(1, 16384)), T([1024, 4096], f16)), {})
cnt: 48, ((T([1024, 4096], f16), T([4096, 4096], f16)), {})
cnt: 48, ((T([4096, 1024], f16, stride=(1, 4096)), T([1024, 4096], f16)), {})
cnt: 1, ((T([1024, 4096], f16), T([4096, 128], f16)), {})
cnt: 1, ((T([4096, 1024], f16, stride=(1, 4096)), T([1024, 128], f16)), {})
Operator: aten.mul.Scalar
cnt: 1, ((T([2, 512, 128], f16), 3.0), {})
cnt: 12, ((T([2, 512, 16384], f16), 3.0), {})
Operator: aten.mul.Tensor
cnt: 1, ((T([2, 1, 1, 512], f16), -65504.0), {})
cnt: 24, ((T([2, 512, 16384], f16), 0.5), {})
cnt: 24, ((T([2, 512, 16384], f16), 0.044715), {})
cnt: 24, ((T([2, 512, 16384], f16), 0.7978845608028654), {})
cnt: 48, ((T([2, 512, 16384], f16), T([2, 512, 16384], f16)), {})
cnt: 2, ((T([2, 512, 128], f16), 0.5), {})
cnt: 2, ((T([2, 512, 128], f16), 0.044715), {})
cnt: 2, ((T([2, 512, 128], f16), 0.7978845608028654), {})
cnt: 4, ((T([2, 512, 128], f16), T([2, 512, 128], f16)), {})
Operator: aten.native_layer_norm.default
cnt: 2, ((T([2, 512, 128], f16), [128], T([128], f16), T([128], f16), 1e-12), {})
cnt: 24, ((T([2, 512, 4096], f16), [4096], T([4096], f16), T([4096], f16), 1e-12), {})
Operator: aten.native_layer_norm_backward.default
cnt: 2, ((T([2, 512, 128], f16), T([2, 512, 128], f16), [128], T([2, 512, 1], f32), T([2, 512, 1], f32), T([128], f16), T([128], f16), [True, True, True]), {})
cnt: 24, ((T([2, 512, 4096], f16), T([2, 512, 4096], f16), [4096], T([2, 512, 1], f32), T([2, 512, 1], f32), T([4096], f16), T([4096], f16), [True, True, True]), {})
Operator: aten.nll_loss_backward.default
cnt: 1, ((T([], f16), T([1024, 30000], f16), T([1024], i64), None, 1, -100, T([], f16)), {})
Operator: aten.nll_loss_forward.default
cnt: 1, ((T([1024, 30000], f16), T([1024], i64), None, 1, -100), {})
Operator: aten.pow.Tensor_Scalar
cnt: 12, ((T([2, 512, 16384], f16), 3.0), {})
cnt: 1, ((T([2, 512, 128], f16), 3.0), {})
cnt: 1, ((T([2, 512, 128], f16), 2.0), {})
cnt: 12, ((T([2, 512, 16384], f16), 2.0), {})
Operator: aten.rsub.Scalar
cnt: 1, ((T([2, 1, 1, 512], f16), 1.0), {})
Operator: aten.sum.SymInt
cnt: 1, ((T([1024, 30000], f16), [0], True), {})
cnt: 1, ((T([1024, 128], f16), [0], True), {})
cnt: 61, ((T([1024, 4096], f16), [0], True), {})
cnt: 12, ((T([1024, 16384], f16), [0], True), {})
cnt: 1, ((T([2, 512, 128], f16), [0], True), {})
Operator: aten.tanh.default
cnt: 12, ((T([2, 512, 16384], f16),), {})
cnt: 1, ((T([2, 512, 128], f16),), {})
Operator: aten.tanh_backward.default
cnt: 1, ((T([2, 512, 128], f16), T([2, 512, 128], f16)), {})
cnt: 12, ((T([2, 512, 16384], f16), T([2, 512, 16384], f16)), {})
@@ -0,0 +1,110 @@
Operator: aten._log_softmax.default
cnt: 2, ((T([2, 512], f16), 1, False), {})
Operator: aten._log_softmax_backward_data.default
cnt: 2, ((T([2, 512], f16), T([2, 512], f16), 1, f16), {})
Operator: aten._softmax.default
cnt: 12, ((T([2, 64, 512, 512], f16), -1, False), {})
Operator: aten._softmax_backward_data.default
cnt: 12, ((T([2, 64, 512, 512], f16), T([2, 64, 512, 512], f16), -1, f16), {})
Operator: aten._to_copy.default
cnt: 1, ((T([2, 1, 1, 512], f32),), {'dtype': f16})
Operator: aten._unsafe_view.default
cnt: 36, ((T([2, 64, 512, 64], f16), [128, 512, 64]), {})
cnt: 12, ((T([2, 64, 64, 512], f16), [128, 64, 512]), {})
cnt: 12, ((T([128, 512, 512], f16), [2, 64, 512, 512]), {})
cnt: 12, ((T([128, 512, 64], f16), [2, 64, 512, 64]), {})
cnt: 36, ((T([2, 512, 64, 64], f16), [2, 512, 4096]), {})
cnt: 12, ((T([2, 512, 4096], f16), [1024, 4096]), {})
Operator: aten.add.Tensor
cnt: 1, ((T([2, 512, 128], f16), T([2, 512, 128], f16)), {})
cnt: 12, ((T([2, 64, 512, 512], f16), T([2, 1, 1, 512], f16)), {})
cnt: 72, ((T([2, 512, 4096], f16), T([2, 512, 4096], f16)), {})
cnt: 36, ((T([2, 512, 16384], f16), T([2, 512, 16384], f16)), {})
cnt: 12, ((T([2, 512, 16384], f16), 1.0), {})
cnt: 1, ((T([], f16), T([], f16)), {})
cnt: 99, ((T([4096], f16), T([4096], f16)), {})
cnt: 11, ((T([4096, 16384], f16), T([4096, 16384], f16)), {})
cnt: 11, ((T([16384], f16), T([16384], f16)), {})
cnt: 11, ((T([16384, 4096], f16), T([16384, 4096], f16)), {})
cnt: 44, ((T([4096, 4096], f16), T([4096, 4096], f16)), {})
Operator: aten.add_.Tensor
cnt: 1, ((T([2, 512, 128], f16), T([1, 512, 128], f16)), {})
Operator: aten.addmm.default
cnt: 1, ((T([4096], f16), T([1024, 128], f16), T([128, 4096], f16, stride=(1, 128))), {})
cnt: 48, ((T([4096], f16), T([1024, 4096], f16), T([4096, 4096], f16, stride=(1, 4096))), {})
cnt: 12, ((T([16384], f16), T([1024, 4096], f16), T([4096, 16384], f16, stride=(1, 4096))), {})
cnt: 12, ((T([4096], f16), T([1024, 16384], f16), T([16384, 4096], f16, stride=(1, 16384))), {})
cnt: 1, ((T([2], f16), T([1024, 4096], f16), T([4096, 2], f16, stride=(1, 4096))), {})
Operator: aten.bmm.default
cnt: 12, ((T([128, 512, 64], f16), T([128, 64, 512], f16)), {})
cnt: 12, ((T([128, 512, 512], f16), T([128, 512, 64], f16)), {})
cnt: 12, ((T([128, 512, 512], f16, stride=(262144, 1, 512)), T([128, 512, 64], f16)), {})
cnt: 12, ((T([128, 512, 64], f16), T([128, 64, 512], f16, stride=(32768, 1, 64))), {})
cnt: 12, ((T([128, 64, 512], f16, stride=(32768, 1, 64)), T([128, 512, 512], f16)), {})
cnt: 12, ((T([128, 512, 512], f16), T([128, 512, 64], f16, stride=(32768, 1, 512))), {})
Operator: aten.cat.default
cnt: 1, (([T([2, 512, 1], f16), T([2, 512, 1], f16)], 2), {})
Operator: aten.clamp.default
cnt: 2, ((T([2], i64), 0, 512), {})
Operator: aten.clone.default
cnt: 1, ((T([2, 512], i64),), {})
cnt: 2, ((T([2], i64),), {})
Operator: aten.copy_.default
cnt: 1, ((T([2, 512], i64), T([2, 512], i64)), {})
cnt: 2, ((T([2], i64), T([2], i64)), {})
Operator: aten.div.Tensor
cnt: 24, ((T([2, 64, 512, 512], f16), 8.0), {})
cnt: 2, ((T([], f16), 2), {})
Operator: aten.embedding.default
cnt: 1, ((T([30000, 128], f16), T([2, 512], i64), 0), {})
cnt: 1, ((T([2, 128], f16), T([2, 512], i64, stride=(0, 1))), {})
cnt: 1, ((T([512, 128], f16), T([1, 512], i64)), {})
Operator: aten.embedding_dense_backward.default
cnt: 1, ((T([1, 512, 128], f16), T([1, 512], i64), 512, -1, False), {})
cnt: 1, ((T([2, 512, 128], f16), T([2, 512], i64, stride=(0, 1)), 2, -1, False), {})
cnt: 1, ((T([2, 512, 128], f16), T([2, 512], i64), 30000, 0, False), {})
Operator: aten.mm.default
cnt: 1, ((T([1024, 2], f16), T([2, 4096], f16)), {})
cnt: 1, ((T([2, 1024], f16, stride=(1, 2)), T([1024, 4096], f16)), {})
cnt: 12, ((T([1024, 4096], f16), T([4096, 16384], f16)), {})
cnt: 12, ((T([4096, 1024], f16, stride=(1, 4096)), T([1024, 16384], f16)), {})
cnt: 12, ((T([1024, 16384], f16), T([16384, 4096], f16)), {})
cnt: 12, ((T([16384, 1024], f16, stride=(1, 16384)), T([1024, 4096], f16)), {})
cnt: 48, ((T([1024, 4096], f16), T([4096, 4096], f16)), {})
cnt: 48, ((T([4096, 1024], f16, stride=(1, 4096)), T([1024, 4096], f16)), {})
cnt: 1, ((T([1024, 4096], f16), T([4096, 128], f16)), {})
cnt: 1, ((T([4096, 1024], f16, stride=(1, 4096)), T([1024, 128], f16)), {})
Operator: aten.mul.Scalar
cnt: 12, ((T([2, 512, 16384], f16), 3.0), {})
Operator: aten.mul.Tensor
cnt: 1, ((T([2, 1, 1, 512], f16), -65504.0), {})
cnt: 24, ((T([2, 512, 16384], f16), 0.5), {})
cnt: 24, ((T([2, 512, 16384], f16), 0.044715), {})
cnt: 24, ((T([2, 512, 16384], f16), 0.7978845608028654), {})
cnt: 48, ((T([2, 512, 16384], f16), T([2, 512, 16384], f16)), {})
Operator: aten.native_layer_norm.default
cnt: 1, ((T([2, 512, 128], f16), [128], T([128], f16), T([128], f16), 1e-12), {})
cnt: 24, ((T([2, 512, 4096], f16), [4096], T([4096], f16), T([4096], f16), 1e-12), {})
Operator: aten.native_layer_norm_backward.default
cnt: 24, ((T([2, 512, 4096], f16), T([2, 512, 4096], f16), [4096], T([2, 512, 1], f32), T([2, 512, 1], f32), T([4096], f16), T([4096], f16), [True, True, True]), {})
cnt: 1, ((T([2, 512, 128], f16), T([2, 512, 128], f16), [128], T([2, 512, 1], f32), T([2, 512, 1], f32), T([128], f16), T([128], f16), [True, True, True]), {})
Operator: aten.nll_loss_backward.default
cnt: 2, ((T([], f16), T([2, 512], f16), T([2], i64), None, 1, 512, T([], f16)), {})
Operator: aten.nll_loss_forward.default
cnt: 2, ((T([2, 512], f16), T([2], i64), None, 1, 512), {})
Operator: aten.pow.Tensor_Scalar
cnt: 12, ((T([2, 512, 16384], f16), 3.0), {})
cnt: 12, ((T([2, 512, 16384], f16), 2.0), {})
Operator: aten.rsub.Scalar
cnt: 1, ((T([2, 1, 1, 512], f16), 1.0), {})
Operator: aten.split.Tensor
cnt: 1, ((T([2, 512, 2], f16), 1, -1), {})
Operator: aten.sum.SymInt
cnt: 1, ((T([1024, 2], f16), [0], True), {})
cnt: 61, ((T([1024, 4096], f16), [0], True), {})
cnt: 12, ((T([1024, 16384], f16), [0], True), {})
cnt: 1, ((T([2, 512, 128], f16), [0], True), {})
Operator: aten.tanh.default
cnt: 12, ((T([2, 512, 16384], f16),), {})
Operator: aten.tanh_backward.default
cnt: 12, ((T([2, 512, 16384], f16), T([2, 512, 16384], f16)), {})