Commit
Merge branch 'main' into findhao/operatorbench7
Showing 167 changed files with 28,737 additions and 65 deletions.
Submodule flash-attention updated 157 files
@@ -0,0 +1,157 @@
import argparse
import sys
import types
from typing import Any, Generator, List, Optional

import torch
from torch._dynamo.backends.cudagraphs import cudagraphs_inner
from torch._inductor.compile_fx import compile_fx
from torch._inductor.utils import gen_gm_and_inputs
from torch._ops import OpOverload
from torch.utils._pytree import tree_map_only

from torchbenchmark.util.triton_op import (
    BenchmarkOperator,
    register_benchmark_mannually,
)

from .operator_inp_utils import aten, OperatorInputsLoader, to_channels_last

timm_loader = None
huggingface_loader = None
torchbench_loader = None


def maybe_load_operator_inputs_loader():
    global timm_loader, huggingface_loader, torchbench_loader
    if timm_loader is None:
        timm_loader = OperatorInputsLoader.get_timm_loader()
    if huggingface_loader is None:
        huggingface_loader = OperatorInputsLoader.get_huggingface_loader()
    if torchbench_loader is None:
        torchbench_loader = OperatorInputsLoader.get_torchbench_loader()


def parse_args(extra_args: Optional[List[str]] = None):
    parser = argparse.ArgumentParser(allow_abbrev=False)
    parser.add_argument(
        "--channel-list",
        action="store_true",
        help="Flag to enable channels-last benchmarking.",
    )
    return parser.parse_known_args(extra_args)


def list_operators() -> List[str]:
    """In the original operator benchmark design, all operators are registered in the
    operator loader. We collect them here.
    """
    maybe_load_operator_inputs_loader()
    all_ops = (
        list(timm_loader.get_all_ops())
        + list(huggingface_loader.get_all_ops())
        + list(torchbench_loader.get_all_ops())
    )
    # remove duplicate operators
    all_ops_str = list(set(str(item) for item in all_ops))
    return all_ops_str


def load_opbench_by_name_from_loader(args: argparse.Namespace):
    all_ops_str = list_operators()
    if args.op not in all_ops_str:
        raise ValueError(f"{args.op} is not found in the operator loader.")
    # args.op is a string; evaluate it to get the actual operator overload
    op_eval = eval(args.op)
    return dynamically_create_aten_op_class(op_eval)


def create_operator_class(op_eval: OpOverload):
    """Create a new class for the operator overload."""

    def __init__(
        self, tb_args: argparse.Namespace, extra_args: Optional[List[str]] = None
    ):
        BenchmarkOperator.__init__(self, tb_args, extra_args)
        native_args, _ = parse_args(extra_args)
        self.channel_list = native_args.channel_list
        self.device = tb_args.device
        self.huggingface_loader = huggingface_loader
        self.torchbench_loader = torchbench_loader
        self.timm_loader = timm_loader
        # CUDA graphs are applied when building the input iterator, so we
        # don't use tritonbench's own CUDA graph support.
        self.use_cuda_graphs = False
        self.DEFAULT_PRECISION = "fp16"
        assert self.dtype in (
            torch.float16,
            torch.float32,
        ), f"AtenOpBenchmark only supports fp16 and fp32, but got {self.dtype}"

    def get_input_iter(self) -> Generator:
        inps_gens = [self.huggingface_loader, self.torchbench_loader, self.timm_loader]
        for inp_gen in inps_gens:
            for inp in inp_gen.get_inputs_for_operator(
                self.op_eval, self.dtype, self.device
            ):
                args, kwargs = inp
                if self.channel_list:
                    args, kwargs = tree_map_only(
                        torch.Tensor, to_channels_last, (args, kwargs)
                    )
                gm, gm_args = gen_gm_and_inputs(self.op_eval, args, kwargs)
                torch.jit._builtins._register_builtin(
                    torch.ops.aten.convolution_backward.default,
                    "aten::convolution_backward",
                )
                if self.device == "cuda":
                    cudagraph_eager = cudagraphs_inner(
                        gm, gm_args, copy_outputs=False, copy_inputs=False
                    )
                    self.eager_op = cudagraph_eager
                    compiled_fn = compile_fx(gm, gm_args)
                    cudagraph_compiled = cudagraphs_inner(
                        compiled_fn, gm_args, copy_outputs=False, copy_inputs=False
                    )
                    self.inductor_op = cudagraph_compiled
                else:
                    self.eager_op = gm
                    self.inductor_op = gm
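                # On CUDA, both variants are wrapped by cudagraphs_inner, which
                # records the call in a CUDA graph and returns a replay callable;
                # compile_fx lowers the FX graph through Inductor first. With
                # copy_inputs/copy_outputs disabled, replays reuse static buffers
                # instead of copying tensors on every call.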

                yield gm_args

    def eager(self, input):
        return lambda: self.eager_op(input)

    def inductor(self, input):
        return lambda: self.inductor_op(input)

    class_attrs = {
        "eager": eager,
        "inductor": inductor,
        "get_input_iter": get_input_iter,
        "__init__": __init__,
    }
    new_class = type("Operator", (BenchmarkOperator,), class_attrs)
    new_class.op_eval = op_eval
    return new_class


def dynamically_create_aten_op_class(op_eval: OpOverload):
    """
    To stay consistent with custom operators, we dynamically create aten operator
    classes here.
    """
    maybe_load_operator_inputs_loader()
    class_name = f"aten_{str(op_eval).replace('.', '_')}"
    module_name = f"torchbenchmark.operator_loader.{class_name}"
    # create a new module for each operator
    op_name_module = types.ModuleType(module_name)
    sys.modules[module_name] = op_name_module
    op_class = create_operator_class(op_eval)
    # __module__ must be set so that _find_op_name_from_module_path works
    op_class.__module__ = module_name
    op_name_module.Operator = op_class
    # Because the class is created dynamically, the decorator can't infer the
    # desired module path, so the benchmarks are registered manually.
    register_benchmark_mannually(class_name, "eager", baseline=True)
    register_benchmark_mannually(class_name, "inductor")
    return op_class
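For context, here is a minimal sketch of how this loader might be driven. It assumes the file above is importable as torchbenchmark.operator_loader (implied by the module_name prefix it builds) and that the surrounding harness supplies an --op argument; neither is shown in this diff.

# Hypothetical driver; a sketch, not the repo's actual CLI.
import argparse

from torchbenchmark.operator_loader import (
    list_operators,
    load_opbench_by_name_from_loader,
)

parser = argparse.ArgumentParser()
parser.add_argument("--op", default="aten.mm.default")
args = parser.parse_args()

print(f"{len(list_operators())} aten operator overloads available")

# Builds a module named torchbenchmark.operator_loader.aten_mm_default,
# places a dynamically created Operator class in it, and registers the
# "eager" (baseline) and "inductor" benchmarks for it.
op_class = load_opbench_by_name_from_loader(args)
print(op_class.__module__, op_class.op_eval)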
115 changes: 115 additions & 0 deletions
torchbenchmark/operator_loader/operator_inp_logs/hf_train/AlbertForMaskedLM_training.txt
@@ -0,0 +1,115 @@
Operator: aten._log_softmax.default
cnt: 1, ((T([1024, 30000], f16), 1, False), {})
Operator: aten._log_softmax_backward_data.default
cnt: 1, ((T([1024, 30000], f16), T([1024, 30000], f16), 1, f16), {})
Operator: aten._softmax.default
cnt: 12, ((T([2, 64, 512, 512], f16), -1, False), {})
Operator: aten._softmax_backward_data.default
cnt: 12, ((T([2, 64, 512, 512], f16), T([2, 64, 512, 512], f16), -1, f16), {})
Operator: aten._to_copy.default
cnt: 1, ((T([2, 1, 1, 512], f32),), {'dtype': f16})
Operator: aten._unsafe_view.default
cnt: 36, ((T([2, 64, 512, 64], f16), [128, 512, 64]), {})
cnt: 12, ((T([2, 64, 64, 512], f16), [128, 64, 512]), {})
cnt: 12, ((T([128, 512, 512], f16), [2, 64, 512, 512]), {})
cnt: 12, ((T([128, 512, 64], f16), [2, 64, 512, 64]), {})
cnt: 36, ((T([2, 512, 64, 64], f16), [2, 512, 4096]), {})
cnt: 12, ((T([2, 512, 4096], f16), [1024, 4096]), {})
Operator: aten.add.Tensor
cnt: 4, ((T([2, 512, 128], f16), T([2, 512, 128], f16)), {})
cnt: 12, ((T([2, 64, 512, 512], f16), T([2, 1, 1, 512], f16)), {})
cnt: 72, ((T([2, 512, 4096], f16), T([2, 512, 4096], f16)), {})
cnt: 36, ((T([2, 512, 16384], f16), T([2, 512, 16384], f16)), {})
cnt: 12, ((T([2, 512, 16384], f16), 1.0), {})
cnt: 1, ((T([2, 512, 128], f16), 1.0), {})
cnt: 99, ((T([4096], f16), T([4096], f16)), {})
cnt: 11, ((T([4096, 16384], f16), T([4096, 16384], f16)), {})
cnt: 11, ((T([16384], f16), T([16384], f16)), {})
cnt: 11, ((T([16384, 4096], f16), T([16384, 4096], f16)), {})
cnt: 44, ((T([4096, 4096], f16), T([4096, 4096], f16)), {})
cnt: 1, ((T([30000, 128], f16), T([30000, 128], f16)), {})
Operator: aten.add_.Tensor
cnt: 1, ((T([2, 512, 128], f16), T([1, 512, 128], f16)), {})
Operator: aten.addmm.default
cnt: 1, ((T([4096], f16), T([1024, 128], f16), T([128, 4096], f16, stride=(1, 128))), {})
cnt: 48, ((T([4096], f16), T([1024, 4096], f16), T([4096, 4096], f16, stride=(1, 4096))), {})
cnt: 12, ((T([16384], f16), T([1024, 4096], f16), T([4096, 16384], f16, stride=(1, 4096))), {})
cnt: 12, ((T([4096], f16), T([1024, 16384], f16), T([16384, 4096], f16, stride=(1, 16384))), {})
cnt: 1, ((T([128], f16), T([1024, 4096], f16), T([4096, 128], f16, stride=(1, 4096))), {})
cnt: 1, ((T([30000], f16), T([1024, 128], f16), T([128, 30000], f16, stride=(1, 128))), {})
Operator: aten.bmm.default
cnt: 12, ((T([128, 512, 64], f16), T([128, 64, 512], f16)), {})
cnt: 12, ((T([128, 512, 512], f16), T([128, 512, 64], f16)), {})
cnt: 12, ((T([128, 512, 512], f16, stride=(262144, 1, 512)), T([128, 512, 64], f16)), {})
cnt: 12, ((T([128, 512, 64], f16), T([128, 64, 512], f16, stride=(32768, 1, 64))), {})
cnt: 12, ((T([128, 64, 512], f16, stride=(32768, 1, 64)), T([128, 512, 512], f16)), {})
cnt: 12, ((T([128, 512, 512], f16), T([128, 512, 64], f16, stride=(32768, 1, 512))), {})
Operator: aten.clone.default
cnt: 2, ((T([2, 512], i64),), {})
Operator: aten.copy_.default
cnt: 2, ((T([2, 512], i64), T([2, 512], i64)), {})
Operator: aten.div.Tensor
cnt: 24, ((T([2, 64, 512, 512], f16), 8.0), {})
Operator: aten.embedding.default
cnt: 1, ((T([30000, 128], f16), T([2, 512], i64), 0), {})
cnt: 1, ((T([2, 128], f16), T([2, 512], i64, stride=(0, 1))), {})
cnt: 1, ((T([512, 128], f16), T([1, 512], i64)), {})
Operator: aten.embedding_dense_backward.default
cnt: 1, ((T([1, 512, 128], f16), T([1, 512], i64), 512, -1, False), {})
cnt: 1, ((T([2, 512, 128], f16), T([2, 512], i64, stride=(0, 1)), 2, -1, False), {})
cnt: 1, ((T([2, 512, 128], f16), T([2, 512], i64), 30000, 0, False), {})
Operator: aten.mm.default
cnt: 1, ((T([1024, 30000], f16), T([30000, 128], f16)), {})
cnt: 1, ((T([30000, 1024], f16, stride=(1, 30000)), T([1024, 128], f16)), {})
cnt: 1, ((T([1024, 128], f16), T([128, 4096], f16)), {})
cnt: 1, ((T([128, 1024], f16, stride=(1, 128)), T([1024, 4096], f16)), {})
cnt: 12, ((T([1024, 4096], f16), T([4096, 16384], f16)), {})
cnt: 12, ((T([4096, 1024], f16, stride=(1, 4096)), T([1024, 16384], f16)), {})
cnt: 12, ((T([1024, 16384], f16), T([16384, 4096], f16)), {})
cnt: 12, ((T([16384, 1024], f16, stride=(1, 16384)), T([1024, 4096], f16)), {})
cnt: 48, ((T([1024, 4096], f16), T([4096, 4096], f16)), {})
cnt: 48, ((T([4096, 1024], f16, stride=(1, 4096)), T([1024, 4096], f16)), {})
cnt: 1, ((T([1024, 4096], f16), T([4096, 128], f16)), {})
cnt: 1, ((T([4096, 1024], f16, stride=(1, 4096)), T([1024, 128], f16)), {})
Operator: aten.mul.Scalar
cnt: 1, ((T([2, 512, 128], f16), 3.0), {})
cnt: 12, ((T([2, 512, 16384], f16), 3.0), {})
Operator: aten.mul.Tensor
cnt: 1, ((T([2, 1, 1, 512], f16), -65504.0), {})
cnt: 24, ((T([2, 512, 16384], f16), 0.5), {})
cnt: 24, ((T([2, 512, 16384], f16), 0.044715), {})
cnt: 24, ((T([2, 512, 16384], f16), 0.7978845608028654), {})
cnt: 48, ((T([2, 512, 16384], f16), T([2, 512, 16384], f16)), {})
cnt: 2, ((T([2, 512, 128], f16), 0.5), {})
cnt: 2, ((T([2, 512, 128], f16), 0.044715), {})
cnt: 2, ((T([2, 512, 128], f16), 0.7978845608028654), {})
cnt: 4, ((T([2, 512, 128], f16), T([2, 512, 128], f16)), {})
Operator: aten.native_layer_norm.default
cnt: 2, ((T([2, 512, 128], f16), [128], T([128], f16), T([128], f16), 1e-12), {})
cnt: 24, ((T([2, 512, 4096], f16), [4096], T([4096], f16), T([4096], f16), 1e-12), {})
Operator: aten.native_layer_norm_backward.default
cnt: 2, ((T([2, 512, 128], f16), T([2, 512, 128], f16), [128], T([2, 512, 1], f32), T([2, 512, 1], f32), T([128], f16), T([128], f16), [True, True, True]), {})
cnt: 24, ((T([2, 512, 4096], f16), T([2, 512, 4096], f16), [4096], T([2, 512, 1], f32), T([2, 512, 1], f32), T([4096], f16), T([4096], f16), [True, True, True]), {})
Operator: aten.nll_loss_backward.default
cnt: 1, ((T([], f16), T([1024, 30000], f16), T([1024], i64), None, 1, -100, T([], f16)), {})
Operator: aten.nll_loss_forward.default
cnt: 1, ((T([1024, 30000], f16), T([1024], i64), None, 1, -100), {})
Operator: aten.pow.Tensor_Scalar
cnt: 12, ((T([2, 512, 16384], f16), 3.0), {})
cnt: 1, ((T([2, 512, 128], f16), 3.0), {})
cnt: 1, ((T([2, 512, 128], f16), 2.0), {})
cnt: 12, ((T([2, 512, 16384], f16), 2.0), {})
Operator: aten.rsub.Scalar
cnt: 1, ((T([2, 1, 1, 512], f16), 1.0), {})
Operator: aten.sum.SymInt
cnt: 1, ((T([1024, 30000], f16), [0], True), {})
cnt: 1, ((T([1024, 128], f16), [0], True), {})
cnt: 61, ((T([1024, 4096], f16), [0], True), {})
cnt: 12, ((T([1024, 16384], f16), [0], True), {})
cnt: 1, ((T([2, 512, 128], f16), [0], True), {})
Operator: aten.tanh.default
cnt: 12, ((T([2, 512, 16384], f16),), {})
cnt: 1, ((T([2, 512, 128], f16),), {})
Operator: aten.tanh_backward.default
cnt: 1, ((T([2, 512, 128], f16), T([2, 512, 128], f16)), {})
cnt: 12, ((T([2, 512, 16384], f16), T([2, 512, 16384], f16)), {})
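These logs use a compact serialization: each "Operator:" line names an aten overload, each "cnt: N, ((args), {kwargs})" line records one distinct call signature and how often it occurred, and T([shape], dtype, stride=...) stands in for a tensor. A minimal sketch of tallying total calls per operator from such a file follows; the regex and the example path are illustrative, not part of the repo's tooling.

# Tally per-operator call counts from an operator_inp_logs file.
import re
from collections import Counter

CNT_RE = re.compile(r"^cnt:\s*(\d+),")

def tally_operator_calls(path: str) -> Counter:
    """Sum the cnt values under each "Operator:" heading."""
    counts: Counter = Counter()
    current_op = None
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line.startswith("Operator:"):
                current_op = line.split(":", 1)[1].strip()
            elif current_op and (m := CNT_RE.match(line)):
                counts[current_op] += int(m.group(1))
    return counts

if __name__ == "__main__":
    tally = tally_operator_calls(
        "torchbenchmark/operator_loader/operator_inp_logs/hf_train/"
        "AlbertForMaskedLM_training.txt"
    )
    for op, n in tally.most_common(5):
        print(f"{n:5d}  {op}")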
110 changes: 110 additions & 0 deletions
...chmark/operator_loader/operator_inp_logs/hf_train/AlbertForQuestionAnswering_training.txt
@@ -0,0 +1,110 @@
Operator: aten._log_softmax.default
cnt: 2, ((T([2, 512], f16), 1, False), {})
Operator: aten._log_softmax_backward_data.default
cnt: 2, ((T([2, 512], f16), T([2, 512], f16), 1, f16), {})
Operator: aten._softmax.default
cnt: 12, ((T([2, 64, 512, 512], f16), -1, False), {})
Operator: aten._softmax_backward_data.default
cnt: 12, ((T([2, 64, 512, 512], f16), T([2, 64, 512, 512], f16), -1, f16), {})
Operator: aten._to_copy.default
cnt: 1, ((T([2, 1, 1, 512], f32),), {'dtype': f16})
Operator: aten._unsafe_view.default
cnt: 36, ((T([2, 64, 512, 64], f16), [128, 512, 64]), {})
cnt: 12, ((T([2, 64, 64, 512], f16), [128, 64, 512]), {})
cnt: 12, ((T([128, 512, 512], f16), [2, 64, 512, 512]), {})
cnt: 12, ((T([128, 512, 64], f16), [2, 64, 512, 64]), {})
cnt: 36, ((T([2, 512, 64, 64], f16), [2, 512, 4096]), {})
cnt: 12, ((T([2, 512, 4096], f16), [1024, 4096]), {})
Operator: aten.add.Tensor
cnt: 1, ((T([2, 512, 128], f16), T([2, 512, 128], f16)), {})
cnt: 12, ((T([2, 64, 512, 512], f16), T([2, 1, 1, 512], f16)), {})
cnt: 72, ((T([2, 512, 4096], f16), T([2, 512, 4096], f16)), {})
cnt: 36, ((T([2, 512, 16384], f16), T([2, 512, 16384], f16)), {})
cnt: 12, ((T([2, 512, 16384], f16), 1.0), {})
cnt: 1, ((T([], f16), T([], f16)), {})
cnt: 99, ((T([4096], f16), T([4096], f16)), {})
cnt: 11, ((T([4096, 16384], f16), T([4096, 16384], f16)), {})
cnt: 11, ((T([16384], f16), T([16384], f16)), {})
cnt: 11, ((T([16384, 4096], f16), T([16384, 4096], f16)), {})
cnt: 44, ((T([4096, 4096], f16), T([4096, 4096], f16)), {})
Operator: aten.add_.Tensor
cnt: 1, ((T([2, 512, 128], f16), T([1, 512, 128], f16)), {})
Operator: aten.addmm.default
cnt: 1, ((T([4096], f16), T([1024, 128], f16), T([128, 4096], f16, stride=(1, 128))), {})
cnt: 48, ((T([4096], f16), T([1024, 4096], f16), T([4096, 4096], f16, stride=(1, 4096))), {})
cnt: 12, ((T([16384], f16), T([1024, 4096], f16), T([4096, 16384], f16, stride=(1, 4096))), {})
cnt: 12, ((T([4096], f16), T([1024, 16384], f16), T([16384, 4096], f16, stride=(1, 16384))), {})
cnt: 1, ((T([2], f16), T([1024, 4096], f16), T([4096, 2], f16, stride=(1, 4096))), {})
Operator: aten.bmm.default
cnt: 12, ((T([128, 512, 64], f16), T([128, 64, 512], f16)), {})
cnt: 12, ((T([128, 512, 512], f16), T([128, 512, 64], f16)), {})
cnt: 12, ((T([128, 512, 512], f16, stride=(262144, 1, 512)), T([128, 512, 64], f16)), {})
cnt: 12, ((T([128, 512, 64], f16), T([128, 64, 512], f16, stride=(32768, 1, 64))), {})
cnt: 12, ((T([128, 64, 512], f16, stride=(32768, 1, 64)), T([128, 512, 512], f16)), {})
cnt: 12, ((T([128, 512, 512], f16), T([128, 512, 64], f16, stride=(32768, 1, 512))), {})
Operator: aten.cat.default
cnt: 1, (([T([2, 512, 1], f16), T([2, 512, 1], f16)], 2), {})
Operator: aten.clamp.default
cnt: 2, ((T([2], i64), 0, 512), {})
Operator: aten.clone.default
cnt: 1, ((T([2, 512], i64),), {})
cnt: 2, ((T([2], i64),), {})
Operator: aten.copy_.default
cnt: 1, ((T([2, 512], i64), T([2, 512], i64)), {})
cnt: 2, ((T([2], i64), T([2], i64)), {})
Operator: aten.div.Tensor
cnt: 24, ((T([2, 64, 512, 512], f16), 8.0), {})
cnt: 2, ((T([], f16), 2), {})
Operator: aten.embedding.default
cnt: 1, ((T([30000, 128], f16), T([2, 512], i64), 0), {})
cnt: 1, ((T([2, 128], f16), T([2, 512], i64, stride=(0, 1))), {})
cnt: 1, ((T([512, 128], f16), T([1, 512], i64)), {})
Operator: aten.embedding_dense_backward.default
cnt: 1, ((T([1, 512, 128], f16), T([1, 512], i64), 512, -1, False), {})
cnt: 1, ((T([2, 512, 128], f16), T([2, 512], i64, stride=(0, 1)), 2, -1, False), {})
cnt: 1, ((T([2, 512, 128], f16), T([2, 512], i64), 30000, 0, False), {})
Operator: aten.mm.default
cnt: 1, ((T([1024, 2], f16), T([2, 4096], f16)), {})
cnt: 1, ((T([2, 1024], f16, stride=(1, 2)), T([1024, 4096], f16)), {})
cnt: 12, ((T([1024, 4096], f16), T([4096, 16384], f16)), {})
cnt: 12, ((T([4096, 1024], f16, stride=(1, 4096)), T([1024, 16384], f16)), {})
cnt: 12, ((T([1024, 16384], f16), T([16384, 4096], f16)), {})
cnt: 12, ((T([16384, 1024], f16, stride=(1, 16384)), T([1024, 4096], f16)), {})
cnt: 48, ((T([1024, 4096], f16), T([4096, 4096], f16)), {})
cnt: 48, ((T([4096, 1024], f16, stride=(1, 4096)), T([1024, 4096], f16)), {})
cnt: 1, ((T([1024, 4096], f16), T([4096, 128], f16)), {})
cnt: 1, ((T([4096, 1024], f16, stride=(1, 4096)), T([1024, 128], f16)), {})
Operator: aten.mul.Scalar
cnt: 12, ((T([2, 512, 16384], f16), 3.0), {})
Operator: aten.mul.Tensor
cnt: 1, ((T([2, 1, 1, 512], f16), -65504.0), {})
cnt: 24, ((T([2, 512, 16384], f16), 0.5), {})
cnt: 24, ((T([2, 512, 16384], f16), 0.044715), {})
cnt: 24, ((T([2, 512, 16384], f16), 0.7978845608028654), {})
cnt: 48, ((T([2, 512, 16384], f16), T([2, 512, 16384], f16)), {})
Operator: aten.native_layer_norm.default
cnt: 1, ((T([2, 512, 128], f16), [128], T([128], f16), T([128], f16), 1e-12), {})
cnt: 24, ((T([2, 512, 4096], f16), [4096], T([4096], f16), T([4096], f16), 1e-12), {})
Operator: aten.native_layer_norm_backward.default
cnt: 24, ((T([2, 512, 4096], f16), T([2, 512, 4096], f16), [4096], T([2, 512, 1], f32), T([2, 512, 1], f32), T([4096], f16), T([4096], f16), [True, True, True]), {})
cnt: 1, ((T([2, 512, 128], f16), T([2, 512, 128], f16), [128], T([2, 512, 1], f32), T([2, 512, 1], f32), T([128], f16), T([128], f16), [True, True, True]), {})
Operator: aten.nll_loss_backward.default
cnt: 2, ((T([], f16), T([2, 512], f16), T([2], i64), None, 1, 512, T([], f16)), {})
Operator: aten.nll_loss_forward.default
cnt: 2, ((T([2, 512], f16), T([2], i64), None, 1, 512), {})
Operator: aten.pow.Tensor_Scalar
cnt: 12, ((T([2, 512, 16384], f16), 3.0), {})
cnt: 12, ((T([2, 512, 16384], f16), 2.0), {})
Operator: aten.rsub.Scalar
cnt: 1, ((T([2, 1, 1, 512], f16), 1.0), {})
Operator: aten.split.Tensor
cnt: 1, ((T([2, 512, 2], f16), 1, -1), {})
Operator: aten.sum.SymInt
cnt: 1, ((T([1024, 2], f16), [0], True), {})
cnt: 61, ((T([1024, 4096], f16), [0], True), {})
cnt: 12, ((T([1024, 16384], f16), [0], True), {})
cnt: 1, ((T([2, 512, 128], f16), [0], True), {})
Operator: aten.tanh.default
cnt: 12, ((T([2, 512, 16384], f16),), {})
Operator: aten.tanh_backward.default
cnt: 12, ((T([2, 512, 16384], f16), T([2, 512, 16384], f16)), {})