diff --git a/experimental/torch_xla2/docs/ops_registry.md b/experimental/torch_xla2/docs/ops_registry.md new file mode 100644 index 00000000000..c0e68f42fc4 --- /dev/null +++ b/experimental/torch_xla2/docs/ops_registry.md @@ -0,0 +1,40 @@ +# Ops Registry + +## Background + +In the [How it works](how_it_works.md) doc, we mentioned two important pieces: + +1. A mechanism to route `ATen` ops to implementations written in + Jax or in PyTorch, and + +2. The ops themselves. + + +The Ops Registry is there to help us organize the ops themselves. + +An op implementation can be written in terms of Jax, or in terms of other PyTorch ops. +The latter is also known as a "decomposition". For decompositions, +one needs to be careful not to introduce circular dependencies. + +Here we simply store the operator implementations in a dictionary, +whose keys are the torch / ATen callables that we wish to override, and +whose values are instances of the `Operator` class. + +The `Operator` class has this schema: + +```python +@dataclasses.dataclass +class Operator: + torch_op: TorchCallable + func: Union[TorchCallable, JaxCallable] + is_jax_function: bool + is_user_defined: bool + needs_env: bool +``` + +The `torch_op` is the corresponding torch callable, and `func` is the implementation. `is_jax_function` is True if `func` is implemented using Jax, and False if `func` is implemented using other torch ops. We use this information to decide how to call it. + +If `needs_env` is true, `func` will receive an extra kwarg named `env`. +This is the "Environment" in which this op operates. In particular, +the environment contains the Jax random number generator key, which might be useful for ops like `aten::rand`. + diff --git a/experimental/torch_xla2/examples/basic_training.py b/experimental/torch_xla2/examples/basic_training.py index 5d3f5a734c5..29e55700a32 100644 --- a/experimental/torch_xla2/examples/basic_training.py +++ b/experimental/torch_xla2/examples/basic_training.py @@ -10,7 +10,11 @@ from torch.utils import _pytree as pytree import torchvision import torchvision.transforms as transforms -import torch_xla2 +import torch_xla2.tensor + + +xla_env = torch_xla2.tensor.Environment(0) +mode = xla_env.mode() # PyTorch TensorBoard support from torch.utils.tensorboard import SummaryWriter @@ -80,6 +84,7 @@ def forward(self, x): model = GarmentClassifier() +model = xla_env.to_xla(model) loss_fn = torch.nn.CrossEntropyLoss() @@ -96,13 +101,6 @@ def forward(self, x): print('Total loss for this batch: {}'.format(loss.item())) # Optimizers specified in the torch.optim package - -# NEW: Move model to XLA device -state_dict = model.state_dict() -state_dict = pytree.tree_map_only(torch.Tensor, - torch_xla2.tensor.move_to_device, state_dict) -model.load_state_dict(state_dict, strict=False, assign=True) - optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9) def train_one_epoch(epoch_index, tb_writer): @@ -115,14 +113,14 @@ def train_one_epoch(epoch_index, tb_writer): for i, data in enumerate(training_loader): # Every data instance is an input + label pair # NEW: Move model to XLA device - data = pytree.tree_map_only(torch.Tensor, - torch_xla2.tensor.move_to_device, data) + data = xla_env.to_xla(data) inputs, labels = data # Zero your gradients for every batch! optimizer.zero_grad() # Make predictions for this batch + outputs = model(inputs) # Compute the loss and its gradients @@ -169,14 +167,11 @@ def train_one_epoch(epoch_index, tb_writer): # Disable gradient computation and reduce memory consumption.
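To make the registry described in `ops_registry.md` above concrete, here is a minimal, self-contained sketch of the idea. The `registry` dict and `register` decorator below are illustrative assumptions for this sketch, not the actual `torch_xla2.ops_registry` API:

```python
# Hypothetical sketch of an ops registry, for illustration only.
import dataclasses
from typing import Callable, Dict, Union

import torch

TorchCallable = Callable
JaxCallable = Callable


@dataclasses.dataclass
class Operator:
    torch_op: TorchCallable
    func: Union[TorchCallable, JaxCallable]
    is_jax_function: bool
    is_user_defined: bool
    needs_env: bool


# Keys: the torch / ATen callables we wish to override.
# Values: Operator instances describing each implementation.
registry: Dict[TorchCallable, Operator] = {}


def register(torch_op, is_jax_function=True, needs_env=False):
    """Decorator that records `func` as the implementation of `torch_op`."""
    def wrapper(func):
        registry[torch_op] = Operator(
            torch_op=torch_op,
            func=func,
            is_jax_function=is_jax_function,
            is_user_defined=True,  # assumption: treated here as a user-registered op
            needs_env=needs_env,
        )
        return func
    return wrapper


@register(torch.ops.aten.add)
def _aten_add(x, y, *, alpha=1):
    # A Jax-backed implementation: takes and returns jax arrays.
    return x + y * alpha
```

At dispatch time, `is_jax_function` tells the caller whether arguments should first be unwrapped into jax arrays before invoking `func` and the result wrapped back, which is the "how to call it" decision the doc refers to.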
with torch.no_grad(): for i, vdata in enumerate(validation_loader): - # NOTE: move to XLA device - vinputs, vlabels = pytree.tree_map_only( - torch.Tensor, - torch_xla2.tensor.move_to_device, - vdata) - voutputs = model(vinputs) # call model's forward - vloss = loss_fn(voutputs, vlabels) - running_vloss += vloss + # NOTE: move to XLA device + vinputs, vlabels = xla_env.to_xla(vdata) + voutputs = model(vinputs) # call model's forward + vloss = loss_fn(voutputs, vlabels) + running_vloss += vloss avg_vloss = running_vloss / (i + 1) print('LOSS train {} valid {}'.format(avg_loss, avg_vloss)) diff --git a/experimental/torch_xla2/examples/basic_training_jax.py b/experimental/torch_xla2/examples/basic_training_jax.py index 3941fcdf8fe..ae6efdf4856 100644 --- a/experimental/torch_xla2/examples/basic_training_jax.py +++ b/experimental/torch_xla2/examples/basic_training_jax.py @@ -8,7 +8,7 @@ import torchvision import torchvision.transforms as transforms import torch_xla2 -import torch_xla2.extra +import torch_xla2.interop import jax import optax import numpy as np @@ -91,7 +91,7 @@ def forward(self, x): def jax_loss(weights, data, label): pred = jax_func(weights, data) - loss = torch_xla2.extra.call_torch(loss_fn, pred, label) + loss = torch_xla2.interop.call_torch(loss_fn, pred, label) return loss grad_fn = jax.jit(jax.value_and_grad(jax_loss)) @@ -155,12 +155,6 @@ def train_one_epoch(jax_weights, opt_state, epoch_index, tb_writer): # Make sure gradient tracking is on, and do a pass over the data model.train(True) - # NEW: Move model to XLA device - state_dict = model.state_dict() - state_dict = pytree.tree_map_only(torch.Tensor, - torch_xla2.tensor.move_to_device, state_dict) - model.load_state_dict(state_dict, strict=False, assign=True) - avg_loss, opt_state = train_one_epoch(jax_weights, opt_state, epoch_number, writer) running_vloss = 0.0 @@ -174,7 +168,7 @@ def train_one_epoch(jax_weights, opt_state, epoch_index, tb_writer): vinputs, vlabels = pytree.tree_map_only(torch.Tensor, torch_xla2.tensor.t2j, vdata) voutputs = jax_func(jax_weights, (vinputs, )) # call model's forward - vloss = torch_xla2.extra.call_torch(loss_fn, voutputs, vlabels) + vloss = torch_xla2.interop.call_torch(loss_fn, voutputs, vlabels) running_vloss += vloss avg_vloss = running_vloss / (i + 1) diff --git a/experimental/torch_xla2/examples/eager_mode.py b/experimental/torch_xla2/examples/eager_mode.py index 358ee6256c6..755f24b0d2b 100644 --- a/experimental/torch_xla2/examples/eager_mode.py +++ b/experimental/torch_xla2/examples/eager_mode.py @@ -1,10 +1,9 @@ - -from torch_xla2.tensor import move_to_device import torch_xla2 from torch import nn from torch.nn import functional as F import torch -from torch.utils import _pytree as pytree + +xla_env = torch_xla2.default_env() class MyModel(nn.Module): @@ -22,21 +21,21 @@ def forward(self, x): return x m = MyModel() +m = xla_env.to_xla(m) # Execute this model using torch inputs = (torch.randn(3, 3, 28, 28), ) +inputs = xla_env.to_xla(inputs) -inputs, state_dict = pytree.tree_map_only(torch.Tensor, move_to_device, (inputs, m.state_dict())) -m.load_state_dict(state_dict, strict=False, assign=True) print(m(*inputs)) print('---=====') -from torch_xla2.extra import jax_jit +from torch_xla2.interop import jax_jit @jax_jit def model_func(param, inputs): return torch.func.functional_call(m, param, inputs) -print(model_func(state_dict, inputs)) +print(model_func(m.state_dict(), inputs)) diff --git a/experimental/torch_xla2/test/gemma/test_gemma.py 
b/experimental/torch_xla2/test/gemma/test_gemma.py index bd0bb21dbb1..4d91bc6f9b0 100644 --- a/experimental/torch_xla2/test/gemma/test_gemma.py +++ b/experimental/torch_xla2/test/gemma/test_gemma.py @@ -74,7 +74,7 @@ def test_gemma(self): weights, jax_func = torch_xla2.extract_jax(model) inputs_jax = pytree.tree_map_only( - torch.Tensor, torch_xla2.tensor.move_to_device, inputs) + torch.Tensor, torch_xla2.tensor.t2j, inputs) import jax print(jax.jit(jax_func)(weights, inputs_jax)) diff --git a/experimental/torch_xla2/test/llama/test_llama.py b/experimental/torch_xla2/test/llama/test_llama.py index dae7bf0cc5c..083116ab89e 100644 --- a/experimental/torch_xla2/test/llama/test_llama.py +++ b/experimental/torch_xla2/test/llama/test_llama.py @@ -1,8 +1,5 @@ -import unittest -import jax import torch -from torch._functorch.make_functional import make_functional_with_buffers -from torch_xla2 import tensor, ops # pylint: disable=unused-import +from torch_xla2 import tensor # pylint: disable=unused-import import torch_xla2 from .. import test_base diff --git a/experimental/torch_xla2/test/test_context.py b/experimental/torch_xla2/test/test_context.py index 1a75a7d23d0..a6bcda5113a 100644 --- a/experimental/torch_xla2/test/test_context.py +++ b/experimental/torch_xla2/test/test_context.py @@ -1,20 +1,22 @@ import unittest import torch -import torch_xla2 from torch_xla2 import tensor +xla_env = tensor.Environment(0) + class TestContext(unittest.TestCase): + def test_mode_context_manager(self): - with torch_xla2.mode(): + with xla_env: x = torch.full((3, 3), -1) self.assertIsInstance(x, tensor.XLATensor2) y = x.abs() self.assertIsInstance(y, tensor.XLATensor2) @staticmethod - @torch_xla2.mode() + @xla_env def _test_mode_decorator(): x = torch.full((3, 3), -1) y = x.abs() diff --git a/experimental/torch_xla2/test/test_core_aten_ops.py b/experimental/torch_xla2/test/test_core_aten_ops.py index 357e41c9101..6a1cef306be 100644 --- a/experimental/torch_xla2/test/test_core_aten_ops.py +++ b/experimental/torch_xla2/test/test_core_aten_ops.py @@ -1,7 +1,6 @@ import unittest import torch -from torch_xla2 import ops_registry from torch_xla2 import tensor from . 
import test_base @@ -34,12 +33,13 @@ def run_export_and_compare(testcase, rtol=1e-5, equal_nan=True, ignore_indices=False): + with testcase.subTest("torch_eval"): res = func(*args, **kwargs) with testcase.subTest("torch_xla2_eval"): - args2, kwargs2 = pytree.tree_map_only(torch.Tensor, tensor.move_to_device, - (args, kwargs)) - res2 = func(*args2, **kwargs2) + args2, kwargs2 = testcase.env.to_xla((args, kwargs)) + with testcase.env: + res2 = func(*args2, **kwargs2) res2 = pytree.tree_map_only(tensor.XLATensor2, lambda t: t.torch(), res2) # import pdb; pdb.set_trace() with testcase.subTest("torch_xla2_diff:" + str(atol)): @@ -61,11 +61,11 @@ class TestCoreAtenOps(unittest.TestCase): @classmethod def setUpClass(cls): super().setUpClass() - ops_registry.print_missing_ops() def setUp(self): super().setUp() torch.manual_seed(0) + self.env = tensor.Environment(0) def test_aten_abs_0(self): args = (torch.randn((10, 10)).to(torch.float32),) @@ -2109,7 +2109,7 @@ def test_aten_logit_0(self): def test_aten_logit_1(self): args = (torch.randn((10, 10)).to(torch.float16),) kwargs = dict() - run_export_and_compare(self, torch.ops.aten.logit, args, kwargs) + run_export_and_compare(self, torch.ops.aten.logit, args, kwargs, atol=0.01,) def test_aten_logit_2(self): args = (torch.randint(0, 10, (10, 10)).to(torch.int32),) @@ -3639,8 +3639,9 @@ def test_aten__softmax_1(self): def _compare_sorted_result(self, args): res = torch.ops.aten.sort(*args) with self.subTest("torch_xla2_eval"): - args2 = pytree.tree_map_only(torch.Tensor, tensor.move_to_device, args) - res2 = torch.ops.aten.sort(*args2) + args2 = self.env.to_xla(args) + with self.env: + res2 = torch.ops.aten.sort(*args2) # The second argument is the sorted index. These might not be # identical from torch vs. jax; but both can be correct diff --git a/experimental/torch_xla2/test/test_extra.py b/experimental/torch_xla2/test/test_extra.py deleted file mode 100644 index 768488d6a99..00000000000 --- a/experimental/torch_xla2/test/test_extra.py +++ /dev/null @@ -1,64 +0,0 @@ -import unittest -import torch -import torch.nn.functional as F -import jax -import jax.numpy as jnp -import torch_xla2 -from torch_xla2 import tensor, extra - - -class ExtraTest(unittest.TestCase): - - def setUp(self): - torch.manual_seed(0) - - def test_standard_callable(self): - def f(a, b): - return torch.add(a, b) - - a = jnp.ones((10, )) - b = jnp.ones((10, )) - - c = extra.jax_view(f)(a, b) - self.assertTrue(jnp.allclose(c, a + b)) - - def f2(a, b): - return jnp.add(a, b) - - a = tensor.move_to_device(torch.ones((10, ))) - b = tensor.move_to_device(torch.ones((10, ))) - c2 = extra.torch_view(f2)(a, b) - - self.assertTrue(jnp.allclose(c2._elem, c)) - - - - def test_fori_loop(self): - a = tensor.move_to_device(torch.ones((10, 10))) - - def body(i, c): - return c + a[i] - - init_val = tensor.move_to_device(torch.zeros(10)) - res = extra.fori_loop(0, 10, body, init_val) - expect = torch.ones(10) * 10 - self.assertTrue(torch.allclose(tensor.j2t(res._elem), expect)) - - def test_jax_jit(self): - - # functions that acts on torch tensor - def f(a, b): - return torch.sin(a) + torch.cos(b) - - fjitted = extra.jax_jit(f) - a = torch.rand((10, 10)) - b = torch.rand((10, 10)) - aj = tensor.move_to_device(a) - bj = tensor.move_to_device(b) - res = f(a, b) - res2 = fjitted(aj, bj) - self.assertTrue(torch.allclose(res, tensor.j2t(res2._elem))) - - -if __name__ == '__main__': - unittest.main() diff --git a/experimental/torch_xla2/test/test_functions.py 
b/experimental/torch_xla2/test/test_functions.py index 76e842d6fdd..2d624b25b5b 100644 --- a/experimental/torch_xla2/test/test_functions.py +++ b/experimental/torch_xla2/test/test_functions.py @@ -3,12 +3,14 @@ from absl.testing import parameterized import torch import torch_xla2 -import torch_xla2.functions import torch_xla2.tensor class TestTorchFunctions(parameterized.TestCase): + def setUp(self): + self.env = torch_xla2.tensor.Environment(0) + @parameterized.named_parameters( ('tensor_2d', lambda: torch.tensor([[0.1, 1.2], [2.2, 3.1], [4.9, 5.2]])), ('tensor_1d', lambda: torch.tensor([0, 1],)), @@ -32,7 +34,7 @@ class TestTorchFunctions(parameterized.TestCase): def test_tensor_constructor(self, func: Callable[[], torch.Tensor]): expected = func() - with torch_xla2.functions.XLAFunctionMode(): + with self.env: actual = func() self.assertIsInstance(actual, torch_xla2.tensor.XLATensor2) diff --git a/experimental/torch_xla2/test/test_mutations.py b/experimental/torch_xla2/test/test_mutations.py index 2f9ddca975b..50d78aa0fae 100644 --- a/experimental/torch_xla2/test/test_mutations.py +++ b/experimental/torch_xla2/test/test_mutations.py @@ -6,46 +6,43 @@ class TestMutations(TestCase): - def test_add(self): - x = torch.tensor([1, 2, 3], dtype=torch.int32) - y = torch.tensor([4, 5, 6], dtype=torch.int32) + def setUp(self): + self.env = torch_xla2.tensor.Environment(0) - x = torch_xla2.tensor.move_to_device(x) - y = torch_xla2.tensor.move_to_device(y) - x.add_(y) - xt = torch_xla2.tensor.j2t(x._elem) - self.assertEqual(xt, torch.tensor([5, 7, 9], dtype=torch.int32)) + def test_add(self): + with self.env: + x = torch.tensor([1, 2, 3], dtype=torch.int32) + y = torch.tensor([4, 5, 6], dtype=torch.int32) + x.add_(y) + xt = torch_xla2.tensor.j2t(x._elem) + self.assertEqual(xt, torch.tensor([5, 7, 9], dtype=torch.int32)) def test_sub(self): - x = torch.tensor([1, 2, 3], dtype=torch.int32) - y = torch.tensor([4, 5, 6], dtype=torch.int32) - - x = torch_xla2.tensor.move_to_device(x) - y = torch_xla2.tensor.move_to_device(y) - x.sub_(y) - xt = torch_xla2.tensor.j2t(x._elem) - self.assertEqual(xt, torch.tensor([-3, -3, -3], dtype=torch.int32)) + with self.env: + x = torch.tensor([1, 2, 3], dtype=torch.int32) + y = torch.tensor([4, 5, 6], dtype=torch.int32) + x.sub_(y) + xt = torch_xla2.tensor.j2t(x._elem) + self.assertEqual(xt, torch.tensor([-3, -3, -3], dtype=torch.int32)) def test_mul(self): - x = torch.tensor([1, 2, 3], dtype=torch.int32) - y = torch.tensor([4, 5, 6], dtype=torch.int32) + with self.env: + x = torch.tensor([1, 2, 3], dtype=torch.int32) + y = torch.tensor([4, 5, 6], dtype=torch.int32) - x = torch_xla2.tensor.move_to_device(x) - y = torch_xla2.tensor.move_to_device(y) - x.mul_(y) - xt = torch_xla2.tensor.j2t(x._elem) - self.assertEqual(xt, torch.tensor([4, 10, 18], dtype=torch.int32)) + x.mul_(y) + xt = torch_xla2.tensor.j2t(x._elem) + self.assertEqual(xt, torch.tensor([4, 10, 18], dtype=torch.int32)) def test_div(self): - x = torch.tensor([1, 2, 3], dtype=torch.int32) - y = torch.tensor([4, 5, 6], dtype=torch.int32) - - x = torch_xla2.tensor.move_to_device(x) - y = torch_xla2.tensor.move_to_device(y) - x.div_(y) - xt = torch_xla2.tensor.j2t(x._elem) - self.assertEqual(xt, - torch.tensor([1. / 4, 2. / 5, 3. / 6], dtype=torch.float)) + with self.env: + x = torch.tensor([1, 2, 3], dtype=torch.int32) + y = torch.tensor([4, 5, 6], dtype=torch.int32) + + x.div_(y) + xt = torch_xla2.tensor.j2t(x._elem) + self.assertEqual(xt, + torch.tensor([1. / 4, 2. / 5, 3. 
/ 6], dtype=torch.float)) if __name__ == '__main__': diff --git a/experimental/torch_xla2/test/test_ops.py b/experimental/torch_xla2/test/test_ops.py index 5f6fdbbeab2..20686f2fe6c 100644 --- a/experimental/torch_xla2/test/test_ops.py +++ b/experimental/torch_xla2/test/test_ops.py @@ -7,7 +7,6 @@ instantiate_device_type_tests, ops) from torch.utils import _pytree as pytree from torch_xla2 import tensor -import torch_xla2 skiplist = { @@ -626,10 +625,9 @@ def run_export_and_compare(testcase, with testcase.subTest("torch_eval"): res = func(sample_input.input, *sample_input.args, **sample_input.kwargs) with testcase.subTest("torch_xla2_eval"): - input2, args2, kwargs2 = pytree.tree_map_only( - torch.Tensor, tensor.move_to_device, - (sample_input.input, sample_input.args, sample_input.kwargs)) - with torch_xla2.mode(): + input2, args2, kwargs2 = testcase.env.to_xla(( + sample_input.input, sample_input.args, sample_input.kwargs)) + with testcase.env: res2 = func(input2, *args2, **kwargs2) res2 = pytree.tree_map_only(tensor.XLATensor2, lambda t: t.torch(), res2) with testcase.subTest("torch_xla2_diff:" + str(atol)): @@ -655,6 +653,9 @@ class TestOpInfo(TestCase): def setUpClass(cls): print('op_db size: ', len(op_db), 'testing: ', len(ops_to_test)) + def setUp(self): + self.env = tensor.Environment(0) + @ops(ops_to_test, allowed_dtypes=(torch.float32, torch.long)) def test_reference_eager(self, device, dtype, op): sample_inputs = op.sample_inputs(device, dtype) diff --git a/experimental/torch_xla2/torch_xla2/__init__.py b/experimental/torch_xla2/torch_xla2/__init__.py index b0bb20712d4..bd0e00fa6ca 100644 --- a/experimental/torch_xla2/torch_xla2/__init__.py +++ b/experimental/torch_xla2/torch_xla2/__init__.py @@ -1,31 +1,34 @@ -import contextlib import jax import torch from torch._functorch import make_functional from torch.utils import _pytree as pytree -from torch_xla2 import export, _ops, ops_registry, tensor, tf_integration, functions +from torch_xla2 import export, tensor, tf_integration jax.config.update('jax_enable_x64', True) +env = None +def default_env(): + global env + if env is None: + env = tensor.Environment(0) + return env -@contextlib.contextmanager -def mode(): - with tensor.XLADispatchMode(), functions.XLAFunctionMode(): - yield -def extract_jax(mod: torch.nn.Module): +def extract_jax(mod: torch.nn.Module, env=None): """Returns a pytree of jax.ndarray and a jax callable.""" + if env is None: + env = default_env() func, weights, buffer = make_functional.make_functional_with_buffers(mod) - states = (weights, buffer) + states = mod.state_dict() + states = pytree.tree_map_only(torch.Tensor, tensor.t2j, states) #@jax.jit def jax_func(states, inputs): - (states, inputs) = tensor.wrap((states, inputs)) - weights, buffer = states - with tensor.XLADispatchMode(): - res = func(weights, buffer, *inputs) - return tensor.unwrap(res) + (states, inputs) = env.j2t_iso((states, inputs)) + with env: + res = torch.func.functional_call(mod, states, inputs) + return env.t2j_iso(res) return states, jax_func diff --git a/experimental/torch_xla2/torch_xla2/_ops.py b/experimental/torch_xla2/torch_xla2/_ops.py deleted file mode 100644 index e3650234372..00000000000 --- a/experimental/torch_xla2/torch_xla2/_ops.py +++ /dev/null @@ -1,1781 +0,0 @@ -# pylint: disable -"""Torch ops implemented using jax.""" -import sys - -import jax -from jax import numpy as jnp -import numpy as np -import torch -from torch_xla2 import ops_registry -from torch_xla2 import tensor - - -class TorchFunctionLowering: - - def 
__init__(self, func, is_jax_func, should_jit=False): - if is_jax_func and should_jit: - func = jax.jit(func) - self.func = func - self.is_jax_func = is_jax_func - - def __call__(self, *args, **kwargs): - if self.is_jax_func: - (args, kwargs) = tensor.unwrap((args, kwargs)) - res = self.func(*args, **kwargs) - if self.is_jax_func: - res = tensor.wrap(res) - return res - - -def op(aten_op, is_jax_func=True): - """if is_jax_func is true, then the function it will register - - should takes jax array as input and returns jax array. - - Which means we need to wrap it - """ - - def inner(func): - ops_registry.lowerings.register(aten_op, - TorchFunctionLowering(func, is_jax_func)) - return func - - return inner - - -@op(torch.ops.aten.view_copy) -@op(torch.ops.aten.view) -@op(torch.ops.aten._unsafe_view) -@op(torch.ops.aten.reshape) -def _aten_unsafe_view(x, shape): - return jnp.reshape(x, shape) - - -@op(torch.ops.aten.add) -def _aten_add(x, y, *, alpha=1): - """if isinstance(x, jnp.ndarray) and isinstance(y, jnp.ndarray): - - assert x.dtype == y.dtype, (x.dtype, y.dtype) - """ - return x + y * alpha - - -@op(torch.ops.aten.copy_, is_jax_func=False) -def _aten_copy(x, y, memory_format=None): - if isinstance(x, tensor.XLATensor2): - x._elem = y._elem - elif isinstance(x, tensor.SliceView): - x.mutate(y) - return x - - -@op(torch.ops.aten.clone) -def _aten_clone(x, memory_format=None): - return jnp.copy(x) - - -@op(torch.ops.aten.full) -def _aten_full(size, value, **kwargs): - return jnp.full(size, value) - - -@op(torch.ops.aten.index_copy) -def _aten_index_copy(x, dim, indexes, source): - # return jax.lax.scatter(x, index, dim) - dims = [] - for i in range(len(x.shape)): - if i == dim: - dims.append(indexes) - else: - dims.append(slice(None, None, None)) - return x.at[dim].set(source) - - -@op(torch.ops.aten.select) -@op(torch.ops.aten.index_select) -@op(torch.ops.aten.select_copy) -def _aten_index_select(x, dim, indexes): - dims = [] - for i in range(len(x.shape)): - if i == dim: - dims.append(indexes) - else: - dims.append(slice(None, None, None)) - return x[tuple(dims)] - - -@op(torch.ops.aten.mean) -def _aten_mean(x, dim=None, keepdim=False): - return jnp.mean(x, dim, keepdims=keepdim) - - -def _torch_binary_scalar_type(scalar, tensor): - if "float" in str(tensor.dtype): - return tensor.dtype - - if isinstance(scalar, int): - if "int" in str(tensor.dtype): - return tensor.dtype - - return jnp.float32 - - -@op(torch.ops.aten.sub) -def _aten_sub(x, y): - if isinstance(x, float): - dtype = _torch_binary_scalar_type(x, y) - x = jnp.array(x, dtype=dtype) - if isinstance(y, float): - dtype = _torch_binary_scalar_type(y, x) - y = jnp.array(y, dtype=dtype) - return x - y - - -@op(torch.ops.aten.mm) -def _aten_mm(x, y): - res = x @ y - return res - - -@op(torch.ops.aten.mul) -def _aten_mul(x, y): - return x * y - - -@op(torch.ops.aten.silu) -def _aten_silu(x): - return jax.nn.silu(x) - - -@op(torch.ops.aten.t) -def _aten_t(x): - return jnp.transpose(x) - - -@op(torch.ops.aten.transpose) -@op(torch.ops.aten.transpose_copy) -def _aten_transpose(x, dim0, dim1): - shape = list(range(len(x.shape))) - shape[dim0], shape[dim1] = shape[dim1], shape[dim0] - return jnp.transpose(x, shape) - - -@op(torch.ops.aten.triu) -def _aten_triu(m, k): - return jnp.triu(m, k) - - -@op(torch.ops.aten.slice) -@op(torch.ops.aten.slice_copy) -def _aten_slice(self, dim=0, start=None, end=None, step=1): - if end == sys.maxsize: - end = self.shape[dim] - sl = slice(start, end, step) - dims = [] - for i in range(len(self.shape)): - 
if i == dim: - dims.append(sl) - else: - dims.append(slice(None, None, None)) - return self[tuple(dims)] - - -@op(torch.ops.aten.detach) -def _aten_detach(self): - return self - - -@op(torch.ops.aten.view_as_real) -def _aten_view_as_real(x): - real = jnp.real(x) - im = jnp.imag(x) - res = jnp.stack([real, im], -1) - return res - - -@op(torch.ops.aten.stack) -def _aten_stack(tensors, dim=0): - return jnp.stack(tensors, dim) - - -@op(torch.ops.aten._softmax) -def _aten_softmax(x, dim, halftofloat): - return jax.nn.softmax(x, dim) - - -@op(torch.ops.aten.pow) -def _aten_pow(x, y): - if isinstance(y, int): - y = float(y) - return jnp.power(x, y) - - -@op(torch.ops.aten.view_as_complex) -def _aten_view_as_complex(input): - if input.dtype == jnp.bfloat16: - input = input.astype(jnp.float32) - x, y = input[..., 0], input[..., 1] - return jax.lax.complex(x, y) - - -@op(torch.ops.aten.div) -def _aten_div(x, y, rounding_mode=""): - res = x / y - if rounding_mode == "trunc": - res = jnp.trunc(res) - return res - - -@op(torch.ops.aten.div_, is_jax_func=False) -def _aten_div_(x, y, rounding_mode=""): - x._elem = _aten_div(x._elem, y._elem, rounding_mode) - return x - - -@op(torch.ops.aten.true_divide) -def _aten_true_divide(x, y): - return x / y - - -@op(torch.ops.aten.bmm) -def _aten_bmm(x, y): - res = x @ y - return res - # return jnp.einsum('bnm,bmk->bnk', x, y) - - -@op(torch.ops.aten.embedding) -# embedding(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -def _aten_embedding(a, w, padding_idx=-1): - return jnp.take(a, w, axis=0) - - -@op(torch.ops.aten.rsqrt) -def _aten_rsqrt(x): - if isinstance(x, int): - x = float(x) - if x.dtype == jnp.int32: - x = x.astype(jnp.float32) - return jax.lax.rsqrt(x) - - -@op(torch.ops.aten.expand) -@op(torch.ops.aten.expand_copy) -def _aten_expand(x, dims): - - def fix_dims(d, xs): - if d == -1: - return xs - return d - - dims = [fix_dims(p, s) for p, s in zip(dims, x.shape)] - return jnp.broadcast_to(x, dims) - - -@op(torch.ops.aten.dot) -def _aten_dot(x, y): - return jnp.dot(x, y) - - -@op(torch.ops.aten._to_copy) -def _aten__to_copy(self, **kwargs): - dtype = tensor.t2j_dtype(kwargs["dtype"]) - if dtype != self.dtype: - return self.astype(dtype) - return jnp.copy(self) - - -@op(torch.ops.aten.empty) -def _aten_empty(sizes, **kwargs): - return jnp.zeros(sizes) - - -@op(torch.ops.aten.index_put_) -@op(torch.ops.aten.index_put) -def _aten_index_put(self, indexes, values, accumulate=False): - indexes = [slice(None, None, None) if i is None else i for i in indexes] - indexes = tuple(indexes) - if accumulate: - return self.at[indexes].add(values) - else: - return self.at[indexes].set(values) - - -@op(torch.ops.aten.index) -@op(torch.ops.aten._unsafe_index) -@op(torch.ops.aten.index.Tensor) -def _aten_index(self, indexes): - indexes = [slice(None, None, None) if i is None else i for i in indexes] - indexes = tuple(indexes) - return self[indexes] - - -@op(torch.ops.aten.split) -@op(torch.ops.aten.split_copy) -@op(torch.ops.aten.split_with_sizes) -def split_with_sizes(x, sizes, dim=0): - """Splits an array `x` into sub-arrays based on static sizes `sizes`. - - Args: - x: The input array to split. - sizes: A 1D array of integer sizes for each sub-array. - - Returns: - A list of sub-arrays. 
- """ - if isinstance(sizes, int): - # split equal size - new_sizes = [sizes] * (x.shape[dim] // sizes) - sizes = new_sizes - rank = x.ndim - splits = np.cumsum(sizes) # Cumulative sum for split points - - def make_range(rank, dim, start, end): - res = [slice(None, None, None)] * rank - res[dim] = slice(start, end) - return tuple(res) - - return [ - x[make_range(rank, dim, start, end)] - for start, end in zip([0] + list(splits[:-1]), splits) - ] - - -@op(torch.ops.aten.permute) -@op(torch.ops.aten.permute_copy) -def permute(t, dims): - return jnp.transpose(t, dims) - - -@op(torch.ops.aten.unsqueeze) -@op(torch.ops.aten.unsqueeze_copy) -@op(torch.ops.aten.unsqueeze.default) -def _aten_unsqueeze(self, dim): - if dim < 0: - dim += self.ndim + 1 - return jnp.expand_dims(self, dim) - - -@op(torch.ops.aten.ne) -def _aten_ne(x, y): - return jnp.not_equal(x, y) - - -@op(torch.ops.aten.cumsum) -def _aten_cumsum(x, y, dtype=None): - dtype = tensor.t2j_dtype(dtype) - res = jnp.cumsum(x, y, dtype) - return res - - -@op(torch.ops.aten.native_layer_norm) -def _aten_native_layer_norm(input, - normalized_shape, - weight=None, - bias=None, - eps=1e-5): - """Implements layer normalization in Jax as defined by `aten::native_layer_norm`. - - Args: - input: The input tensor. - normalized_shape: A list of integer dimensions to be normalized over. - weight: Optional weight tensor for the affine transformation. - bias: Optional bias tensor for the affine transformation. - eps: A small epsilon value for numerical stability. - - Returns: - output: The normalized tensor. - mean: The calculated mean tensor. - std: The calculated standard deviation tensor. - """ - if isinstance(normalized_shape, int): - normalized_shape = [normalized_shape] - axis = [i for i, d in enumerate(input.shape) if d in normalized_shape] - - # Calculate mean and standard deviation - mean = jnp.mean(input, axis=axis, keepdims=True) - var = jnp.var(input, axis=axis, keepdims=True) - rstd = jax.lax.rsqrt(var + eps) - - # Normalize the input - norm_x = (input - mean) * rstd - - # Apply affine transformation (if provided) - if weight is not None: - norm_x *= weight - if bias is not None: - norm_x += bias - return norm_x, mean, rstd - - -# - func: addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor -@op(torch.ops.aten.addmm) -@op(torch.ops.aten.addmv) -def _aten_addmm(self, mat1, mat2, *, beta=1.0, alpha=1.0): - alpha = jnp.array(alpha).astype(mat1.dtype) - beta = jnp.array(beta).astype(mat1.dtype) - self *= beta - self += alpha * jnp.matmul(mat1, mat2) - return self - -@op(torch.ops.aten.addbmm.default) -def _aten_addbmm(input, batch1, batch2, *, beta=1, alpha=1): - alpha = jnp.array(alpha).astype(batch1.dtype) - beta = jnp.array(beta).astype(batch1.dtype) - mm = jnp.einsum('bxy, byz -> xz', batch1, batch2) - return jax.lax.cond(beta == 0, - lambda: alpha * mm, - lambda: beta*input + alpha*mm) - - -@op(torch.ops.aten.gelu) -def _aten_gelu(self, *, approximate="none"): - approx = approximate == "tanh" - return jax.nn.gelu(self, approx) - - -@op(torch.ops.aten.squeeze) -@op(torch.ops.aten.squeeze_copy) -def _aten_squeeze_dim(self, dim): - """Squeezes a Jax tensor by removing a single dimension of size 1. - - Args: - self: The input tensor. - dim: The dimension to squeeze. - - Returns: - The squeezed tensor with the specified dimension removed if it is 1, - otherwise the original tensor is returned. 
- """ - - # Validate input arguments - if not isinstance(self, jnp.ndarray): - raise TypeError(f"Expected a Jax tensor, got {type(self)}.") - if isinstance(dim, int): - dim = [dim] - - # Check if the specified dimension has size 1 - if all([self.shape[d] != 1 for d in dim]): - return self - - # Use slicing to remove the dimension if it is 1 - new_shape = list(self.shape) - - def fix_dim(p): - if p < 0: - return p + len(self.shape) - return p - - dim = [fix_dim(d) for d in dim] - new_shape = [p for i, p in enumerate(self.shape) if i not in dim or p != 1] - return self.reshape(new_shape) - - -@op(torch.ops.aten.convolution) -def _aten_convolution( - input, - weight, - bias, - stride, - padding, - dilation, - transposed, - output_padding, - groups, -): - if transposed: - raise NotImplementedError("Transposed convolution is not implemented.") - - def make_padding(padding): - return ((p, p) for p in padding) - - def create_default_conv_dimension_numbers(num_spatial_dims): - # Ref: https://github.com/openxla/xla/blob/main/xla/client/xla_builder.cc#L4211 - # (batch dimension, feature dimension, spatial dimensions...) - lhs_spec = [0, 1] - # (out feature dimension, in feature dimension, spatial dimensions...) - rhs_spec = [0, 1] - # (batch dimension, feature dimension, spatial dimensions...) - out_spec = [0, 1] - for i in range(0, num_spatial_dims): - lhs_spec.append(i + 2) - rhs_spec.append(i + 2) - out_spec.append(i + 2) - return jax.lax.ConvDimensionNumbers( - *map(tuple, (lhs_spec, rhs_spec, out_spec))) - - res = jax.lax.conv_general_dilated( - input, - weight, - stride, - make_padding(padding), - lhs_dilation=(1,) * len(stride), - rhs_dilation=dilation, - dimension_numbers=create_default_conv_dimension_numbers(len(stride)), - feature_group_count=groups, - batch_group_count=1, - ) - - if bias is not None: - # TODO(qihqi): bias always on channel? - if len(bias.shape) == 1: - shape = [1] * len(res.shape) - shape[1] = bias.shape[0] - bias = bias.reshape(tuple(shape)) - res = res + bias - return res - - -# _native_batch_norm_legit(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) 
running_var, bool training, float momentum, float eps) -@op(torch.ops.aten._native_batch_norm_legit) -def _aten__native_batch_norm_legit(input, weight, bias, running_mean, - running_var, training, momentum, eps): - return _aten__native_batch_norm_legit_no_training(input, weight, bias, - running_mean, running_var, - momentum, eps) - - -@op(torch.ops.aten._native_batch_norm_legit_no_training) -def _aten__native_batch_norm_legit_no_training(input, weight, bias, - running_mean, running_var, - momentum, eps): - if weight is None: - weight = jnp.ones_like(running_mean) - if bias is None: - bias = jnp.zeros_like(running_mean) - - def broadcast(t): - return jax.lax.broadcast_in_dim(t, input.shape, broadcast_dimensions=(1,)) - - a = input - broadcast(running_mean) - b = broadcast(jnp.sqrt(running_var + eps)) - return ( - a / b * broadcast(weight) + broadcast(bias), - jnp.array([]), - jnp.array([]), - ) - - -@op(torch.ops.aten.relu) -def _aten_relu(self): - return jax.nn.relu(self) - - -@op(torch.ops.aten.cat) -def _aten_cat(tensors, dims=0): - return jnp.concatenate(tensors, dims) - - -@op(torch.ops.aten.max_pool2d_with_indices) -@op(torch.ops.aten.max_pool3d_with_indices) -def _aten_max_pool2d_with_indices(inputs, - kernel_size, - strides, - padding=0, - dilation=1, - ceil_mode=False): - num_batch_dims = len(inputs.shape) - len(kernel_size) - 1 - kernel_size = tuple(kernel_size) - strides = tuple(strides) - if isinstance(padding, int): - padding = tuple((padding, padding) for _ in range(len(kernel_size))) - elif isinstance(padding, list): - padding = tuple((p, p) for p in padding) - - window_shape = kernel_size - num_batch_dims = inputs.ndim - (len(window_shape) + 1) - strides = strides or (1,) * len(window_shape) - assert len(window_shape) == len( - strides), f'len({window_shape}) must equal len({strides})' - strides = (1,) * (1 + num_batch_dims) + strides - dims = (1,) * (1 + num_batch_dims) + window_shape - - is_single_input = False - if num_batch_dims == 0: - # add singleton batch dimension because lax.reduce_window always - # needs a batch dimension. 
- inputs = inputs[None] - strides = (1,) + strides - dims = (1,) + dims - is_single_input = True - - assert inputs.ndim == len(dims), f'len({inputs.shape}) != len({dims})' - if not isinstance(padding, str): - padding = tuple(map(tuple, padding)) - assert len(padding) == len(window_shape), ( - f'padding {padding} must specify pads for same number of dims as ' - f'window_shape {window_shape}') - assert all([len(x) == 2 for x in padding - ]), f'each entry in padding {padding} must be length 2' - padding = ((0, 0), (0, 0)) + padding - - indices = jnp.arange(np.prod(inputs.shape)).reshape(inputs.shape) - - def reduce_fn(a, b): - ai, av = a - bi, bv = b - which = av > bv - return jnp.where(which, ai, bi), jnp.where(which, av, bv) - - init_val = -jnp.inf - if inputs.dtype in (jnp.int32, jnp.int64): - init_val = -(1 << 31) - init_val = jnp.array(init_val).astype(inputs.dtype) - - indices, y = jax.lax.reduce_window((indices, inputs), (0, init_val), - reduce_fn, dims, strides, padding) - if is_single_input: - indices = jnp.squeeze(indices, axis=0) - y = jnp.squeeze(y, axis=0) - return y, indices - - batch_result = pool(inputs, -jnp.inf, jax.lax.max, kernel_size, strides, - padding) - indices = pool(inputs, 0, jnp.argmax, kernel_size, strides, padding) - return batch_result, indices - - -# TODO add more ops - - -@op(torch.ops.aten.min) -def _aten_min(x, axis=None): - return jnp.min(x, axis=axis), jnp.argmin(x, axis=axis).astype(jnp.int64) - - -@op(torch.ops.aten.amin) -def _aten_amin(x, dim=None, keepdim=False): - return _with_reduction_scalar(jnp.amin, x, dim, keepdim) - - -@op(torch.ops.aten.argmin) -def _aten_argmin(self, dim=None, keepdim=False): - return _with_reduction_scalar( - jnp.argmin, self, dim, keepdim) - - -@op(torch.ops.aten.sin) -def _aten_sin(x): - return jnp.sin(x) - - -@op(torch.ops.aten.sym_size) -def _aten_sym_size(x, dim): - return x.shape[dim] - - -@op(torch.ops.aten.var) -@op(torch.ops.prims.var) -def _aten_var(x, dim=None, *, correction=1, keepdim=False, out=None): - return jnp.var(x, axis=dim, ddof=correction, keepdims=keepdim) - - -@op(torch.ops.prims.broadcast_in_dim) -def _prims_broadcast_in_dim(t, shape, broadcast_dimensions): - return jax.lax.broadcast_in_dim( - t, shape, broadcast_dimensions=broadcast_dimensions) - - -# aten.native_group_norm -- should use decomp table -# func: native_group_norm(Tensor input, Tensor? weight, Tensor? bias, SymInt N, SymInt C, SymInt HxW, int group, float eps) -> (Tensor, Tensor, Tensor) - - -@op(torch.ops.aten.native_group_norm) -def _aten_native_group_norm(input, weight, bias, N, C, HxW, group, eps=1e-5): - """Group Normalization implementation in JAX. - - Args: - input: Input tensor. Expected shape (batch_size, channels, ... spatial dims - ...) - weight: Optional scaling (gamma) parameter. Shape (channels,) - bias: Optional shifting (beta) parameter. Shape (channels,) - N: Batch size. - C: Number of channels. - HxW: Product of spatial dimensions (number of elements per channel after - flattening). - group: Number of groups for Group Normalization. - eps: Small value added for numerical stability. 
- - Returns: - A tuple of (normalized_output, mean, rstd) - """ - - input_shape = input.shape - - # Reshape for group-wise normalization - reshaped_input = jnp.reshape(input, (1, N * group, -1)) - - # **Core Group Normalization** - def group_norm_body(x): # Function to apply within each group - mean = jnp.mean(x, axis=-1, keepdims=True) - var = jnp.var(x, axis=-1, keepdims=True) - rstd = jax.lax.rsqrt(var + eps) # Reciprocal of std with epsilon - normalized = (x - mean) * rstd - return normalized, mean, rstd - - normalized, group_mean, group_rstd = jax.lax.map(group_norm_body, - reshaped_input) - - # Reshape back to original input shape - output = jnp.reshape(normalized, input_shape) - - # **Affine transformation** - affine_shape = [-1 if i == 1 else 1 for i in range(input.ndim) - ] # Shape for broadcasting - if weight is not None and bias is not None: - output = bias.reshape(affine_shape) + output * weight.reshape(affine_shape) - elif weight is not None: - output = output * weight.reshape(affine_shape) - elif bias is not None: - output = output + bias.reshape(affine_shape) - - # Reshape mean and rstd - mean = jnp.reshape(group_mean, (N, group)) - rstd = jnp.reshape(group_rstd, (N, group)) - - return output, mean, rstd - - -@op(torch.ops.aten.linalg_vector_norm) -def _aten_linalg_vector_norm(self, ord=2, dim=None, keepdim=False, dtype=None): - """Calculates the vector norm along specified dimensions. - - Args: - self: The input tensor. - ord: The order of the norm. Can be a float or 'inf', '-inf', 'fro'. - Default is 2 (Euclidean norm). - dim: Dimensions along which to calculate the norm. If None, the norm is - calculated over all dimensions. - keepdim: Whether to keep the reduced dimensions. - dtype: Optional data type for the output. - - Returns: - The tensor containing the calculated vector norms. - """ - - if ord not in {2, float("inf"), float("-inf"), "fro"}: - raise ValueError( - f"Unsupported ord value: {ord}. Supported values are 2, inf, -inf, and" - " 'fro'.") - - # Special cases (for efficiency and clarity) - if ord == 2: # Euclidean norm - result = jnp.sqrt(jnp.sum(jnp.abs(self)**2, axis=dim, keepdims=keepdim)) - - elif ord == float("inf"): - result = jnp.max(jnp.abs(self), axis=dim, keepdims=keepdim) - - elif ord == float("-inf"): - result = jnp.min(jnp.abs(self), axis=dim, keepdims=keepdim) - - elif ord == "fro": # Frobenius norm - result = jnp.sqrt(jnp.sum(jnp.abs(self)**2, axis=dim, keepdims=keepdim)) - - else: # General case (e.g., ord = 1, ord = 3) - result = jnp.sum( - jnp.abs(self)**ord, axis=dim, keepdims=keepdim)**(1.0 / ord) - - # (Optional) dtype conversion - if dtype is not None: - result = result.astype(dtype) - - return result - - -# aten.reflection_pad1d -@op(torch.ops.aten.reflection_pad1d) -def _aten_reflection_pad1d(input, padding): - rank = len(input.shape) - pad_size = [(0, 0)] * rank - pad_size[-1] = padding - return jnp.pad(input, pad_size, mode="reflect") - - -# aten.alias -@op(torch.ops.aten.alias) -def _aten_alias(self, *args): - return self - - -# aten.sinh -@op(torch.ops.aten.sinh) -def _aten_sinh(self): - return jnp.sinh(self) - - -# aten.native_layer_norm_backward -@op(torch.ops.aten.native_layer_norm_backward) -def _aten_native_layer_norm_backward(grad_out, - input, - normalized_shape, - weight, - bias, - eps=1e-5): - """Implements the backward pass of layer normalization in Jax as defined by `aten::native_layer_norm_backward`. - - Args: - grad_out: The gradient of the output tensor. - input: The input tensor. 
- normalized_shape: A list of integer dimensions to be normalized over. - weight: Optional weight tensor for the affine transformation. - bias: Optional bias tensor for the affine transformation. - eps: A small epsilon value for numerical stability. - - Returns: - A tuple of (grad_input, grad_weight, grad_bias). - """ - return jax.lax.native_layer_norm_backward(grad_out, input, normalized_shape, - weight, bias, eps) - - -# aten.reflection_pad3d_backward -# aten.reflection_pad2d - - -# aten.atanh -@op(torch.ops.aten.atanh) -def _aten_atanh(self): - return jnp.arctanh(self) - - -# aten.bitwise_not -@op(torch.ops.aten.bitwise_not) -def _aten_bitwise_not(self): - return ~self - - -# aten.embedding_dense_backward - - -# aten.sum -@op(torch.ops.aten.sum) -def _aten_sum(self, dim=None, keepdim=False, dtype=None): - return jnp.sum(self, axis=dim, keepdims=keepdim, dtype=dtype) - - -# aten.sqrt -@op(torch.ops.aten.sqrt) -def _aten_sqrt(self): - return jnp.sqrt(self) - - -@op(torch.ops.aten.tan) -def _aten_tanh(self): - return jnp.tan(self) - - -# aten.tanh -@op(torch.ops.aten.tanh) -def _aten_tanh(self): - return jnp.tanh(self) - - -# aten.ceil -@op(torch.ops.aten.ceil) -def _aten_ceil(self): - return jnp.ceil(self) - - -# aten.asin -@op(torch.ops.aten.asin) -def _aten_asin(self): - return jnp.arcsin(self) - - -# aten.minimum -@op(torch.ops.aten.minimum) -def _aten_minimum(self, other): - return jnp.minimum(self, other) - - -# aten.max_pool2d_backward - - -def _scatter_index(dim, index): - """Returns a tuple of indexes; - - The first is to select in input (to modify), - the second is to select from the values. - """ - index_shape = list(index.shape) - input_indexes = [] - source_indexes = [] - for i in range(len(index_shape)): - source_indexes.append(slice(0, index_shape[i])) - if i == dim: - input_indexes.append(index) - else: - target_shape = [1] * len(index_shape) - target_shape[i] = index_shape[i] - input_indexes.append( - jnp.broadcast_to( - jnp.arange(index_shape[i]).reshape(target_shape), index_shape)) - return tuple(input_indexes), tuple(source_indexes) - - -# aten.scatter_add -@op(torch.ops.aten.scatter_add) -def _aten_scatter_add(input, dim, index, src): - """JAX implementation of scatter, mimicking torch.scatter behavior""" - - input_indexes, source_indexes = _scatter_index(dim, index) - return input.at[input_indexes].add(src[source_indexes]) - - -# aten.logical_not - - -# aten.sign -@op(torch.ops.aten.sign) -def _aten_sign(x): - return jnp.sign(x) - - -# aten.sigmoid -@op(torch.ops.aten.sigmoid) -def _aten_sigmoid(x): - if x.dtype in (jnp.int32, jnp.int64): - x = x.astype(jnp.float32) - return jax.nn.sigmoid(x) - - -# implement aten.asinh in jax -@op(torch.ops.aten.asinh) -def _aten_asinh(self): - return jnp.arcsinh(self) - - -# aten.atan -@op(torch.ops.aten.atan) -def _aten_atan(self): - return jnp.arctan(self) - - -# aten.scatter_reduce -@op(torch.ops.aten.scatter_reduce) -def _aten_scatter_reduce(input, dim, index, src, reduce, *, include_self=True): - input_indexes, source_indexes = _scatter_index(dim, index) - if reduce == "sum": - return input.at[input_indexes].add(src[source_indexes]) - elif reduce == "prod": - return input.at[input_indexes].multiply(src[source_indexes]) - elif reduce == "mean": - return input.at[input_indexes].add(src[source_indexes]) - elif reduce == "amax": - return input.at[input_indexes].max(src[source_indexes]) - elif reduce == "amin": - return input.at[input_indexes].min(src[source_indexes]) - else: - raise RuntimeError('Unknow reduction type: ', reduce) - 
- -# aten.acos -@op(torch.ops.aten.acos) -def _aten_acos(self): - return jnp.arccos(self) - - -# aten.sym_storage_offset -# aten.native_layer_norm_backward -# aten.max_pool3d_with_indices - - -# aten.gt -@op(torch.ops.aten.gt) -def _aten_gt(self, other): - return self > other - - -# aten.pixel_shuffle -@op(torch.ops.aten.pixel_shuffle) -def _aten_pixel_shuffle(x, upscale_factor): - """PixelShuffle implementation in JAX. - - Args: - x: Input tensor. Typically a feature map. - upscale_factor: Integer by which to upscale the spatial dimensions. - - Returns: - Tensor after PixelShuffle operation. - """ - - batch_size, channels, height, width = x.shape - - if channels % (upscale_factor**2) != 0: - raise ValueError( - 'Number of channels must be divisible by the square of the upscale factor.' - ) - - new_channels = channels // (upscale_factor**2) - new_height = height * upscale_factor - new_width = width * upscale_factor - - x = x.reshape(batch_size, new_channels, upscale_factor, upscale_factor, - height, width) - x = jnp.transpose(x, - (0, 1, 2, 4, 3, 5)) # Move channels to spatial dimensions - x = x.reshape(batch_size, new_channels, new_height, new_width) - - return x - - -# aten.sym_stride -# aten.lt -@op(torch.ops.aten.lt) -def _aten_lt(self, other): - return self < other - - -def pool(inputs, init, reduce_fn, window_shape, strides, padding): - """Helper function to define pooling functions. - - Pooling functions are implemented using the ReduceWindow XLA op. - NOTE: Be aware that pooling is not generally differentiable. - That means providing a reduce_fn that is differentiable does not imply that - pool is differentiable. - - Args: - inputs: input data with dimensions (batch, window dims..., features). - init: the initial value for the reduction - reduce_fn: a reduce function of the form ``(T, T) -> T``. - window_shape: a shape tuple defining the window to reduce over. - strides: a sequence of ``n`` integers, representing the inter-window - strides (default: ``(1, ..., 1)``). - padding: either the string ``'SAME'``, the string ``'VALID'``, or a sequence - of ``n`` ``(low, high)`` integer pairs that give the padding to apply before - and after each spatial dimension. - Returns: - The output of the reduction for each window slice. - """ - num_batch_dims = inputs.ndim - (len(window_shape) + 1) - strides = strides or (1,) * len(window_shape) - assert len(window_shape) == len( - strides), f'len({window_shape}) must equal len({strides})' - strides = (1,) * (1 + num_batch_dims) + strides - dims = (1,) * (1 + num_batch_dims) + window_shape - - is_single_input = False - if num_batch_dims == 0: - # add singleton batch dimension because lax.reduce_window always - # needs a batch dimension. 
- inputs = inputs[None] - strides = (1,) + strides - dims = (1,) + dims - is_single_input = True - - assert inputs.ndim == len(dims), f'len({inputs.shape}) != len({dims})' - if not isinstance(padding, str): - padding = tuple(map(tuple, padding)) - assert len(padding) == len(window_shape), ( - f'padding {padding} must specify pads for same number of dims as ' - f'window_shape {window_shape}') - assert all([len(x) == 2 for x in padding - ]), f'each entry in padding {padding} must be length 2' - padding = ((0, 0), (0, 0)) + padding - y = jax.lax.reduce_window(inputs, init, reduce_fn, dims, strides, padding) - if is_single_input: - y = jnp.squeeze(y, axis=0) - return y - - -@op(torch.ops.aten._adaptive_avg_pool3d) -def _aten_adaptive_avg_pool3d(x, output_shape): - return _aten_adaptive_avg_pool(x, output_shape, 3) - - -@op(torch.ops.aten._adaptive_avg_pool2d) -def _aten_adaptive_avg_pool3d(x, output_shape): - return _aten_adaptive_avg_pool(x, output_shape, 2) - - -def _aten_adaptive_avg_pool(x, output_shape, pool_dim): - - def adaptive_kernel_size(input_shape, output_shape): - sizes = [1, 1] - spatial_dim_off = len(input_shape) - pool_dim - for spatial_dim in range(pool_dim): - sizes.append(input_shape[spatial_dim_off + spatial_dim] // - output_shape[spatial_dim]) - return tuple(sizes) - - kernel_sizes = adaptive_kernel_size(x.shape, output_shape) - y = pool(x, 0.0, jax.lax.add, kernel_sizes, kernel_sizes, padding='VALID') - - div_shape = list(x.shape) - num_batch_dims = len(x.shape) - pool_dim - 1 - div_shape[num_batch_dims] = 1 - div_shape = tuple(div_shape) - if len(div_shape) - 2 == len(kernel_sizes): - div_shape = (1,) + div_shape[1:] - y = y / pool( - jnp.ones(div_shape), 0.0, jax.lax.add, kernel_sizes, kernel_sizes, - 'VALID') - return y - - -# aten.avg_pool2d -@op(torch.ops.aten.avg_pool2d) -@op(torch.ops.aten.avg_pool3d) -def _aten_avg_pool(inputs, - kernel_size, - strides=None, - padding=0, - ceil_mode=False, - count_include_pad=True, - divisor_override=None): - - num_batch_dims = len(inputs.shape) - len(kernel_size) - 1 - kernel_size = tuple(kernel_size) - strides = tuple(strides) - if isinstance(padding, int): - padding = tuple((padding, padding) for _ in range(len(kernel_size))) - elif isinstance(padding, list): - padding = tuple((p, p) for p in padding) - - y = pool(inputs, 0.0, jax.lax.add, kernel_size, strides, padding) - if count_include_pad: - y = y / np.prod(kernel_size) - else: - div_shape = list(inputs.shape) - div_shape[num_batch_dims] = 1 - div_shape = tuple(div_shape) - if len(div_shape) - 2 == len(kernel_size): - div_shape = (1,) + div_shape[1:] - y = y / pool( - jnp.ones(div_shape), 0.0, jax.lax.add, kernel_size, strides, padding) - return y - - -# aten.sym_numel -# aten.reciprocal -@op(torch.ops.aten.reciprocal) -def _aten_reciprocal(a): - return 1 / a - - -# aten.scatter -@op(torch.ops.aten.select_scatter) -def _aten_select_scatter(input, src, dim, index): - input_indexes = [] - for x in range(len(input.shape)): - if x == dim: - input_indexes.append(index) - else: - input_indexes.append(slice(None, None, None)) - return input.at[tuple(input_indexes)].set(src) - - -@op(torch.ops.aten.scatter.src) -def _aten_scatter_src(input, dim, index, src, reduce=None): - input_index, source_indexes = _scatter_index(dim, index) - return input.at[input_index].set(src[source_indexes]) - - -@op(torch.ops.aten.scatter.value) -def _aten_scatter(input, dim, index, src, reduce=None): - input_index, source_indexes = _scatter_index(dim, index) - return input.at[input_index].set(src) - - -# 
aten.acosh -@op(torch.ops.aten.acosh) -def _aten_acosh(self): - return jnp.arccosh(self) - - -# aten.avg_pool2d_backward -# aten.col2im -# aten.avg_pool3d -# aten.round -@op(torch.ops.aten.round) -def _aten_round(input, decimals=0): - return jnp.round(input, decimals) - - -# aten.max -@op(torch.ops.aten.max) -def _aten_max(self, dim=None, keepdim=False): - return jnp.max( - self, axis=dim, keepdims=keepdim), jnp.argmax( - self, axis=dim, keepdims=keepdim) - - -# aten.maximum -@op(torch.ops.aten.maximum) -def _aten_maximum(self, other): - return jnp.maximum(self, other) - - -# aten.abs -@op(torch.ops.aten.abs) -def _aten_abs(self): - return jnp.abs(self) - - -# generate aten.amax only -@op(torch.ops.aten.amax) -def _aten_amax(self, dim=None, keepdim=False): - return _with_reduction_scalar(jnp.amax, self, dim, keepdim) - - -def _with_reduction_scalar(jax_func, self, dim, keepdim): - expanded = False - if self.ndim == 0: - # for self of rank 0: - # torch.any(x, 0), torch.any(x, -1) works; - # torch.any(x, 1) throws out of bounds, so it's - # behavior is the same as a jnp array of rank 1 - expanded = True - self = jnp.expand_dims(self, 0) - res = jax_func(self, axis=dim, keepdims=keepdim) - if expanded: - res = res.squeeze() - return res - -# aten.any -@op(torch.ops.aten.any) -def _aten_any(self, dim=None, keepdim=False): - return _with_reduction_scalar(jnp.any, self, dim, keepdim) - - -# aten.arange -@op(torch.ops.aten.arange) -def _aten_arange(start, - end=None, - step=1, - *, - dtype=None, - layout=None, - requires_grad=False, - device=None, - pin_memory=False): - if end is None: - end = start - start = 0 - dtype = tensor.t2j_dtype(dtype) - return jnp.arange( - start, - end, - step, - dtype=dtype, - ) - - -# aten.argmax -@op(torch.ops.aten.argmax) -def _aten_argmax(self, dim=None, keepdim=False): - return _with_reduction_scalar( - jnp.argmax, self, dim, keepdim) - - -# aten.as_strided -@op(torch.ops.aten.as_strided) -@op(torch.ops.aten.as_strided_copy) -def _aten_as_strided(x, sizes, strides, storage_offset=None): - ind = jnp.zeros(sizes, dtype=jnp.int32) - - for i, (size, stride) in enumerate(zip(sizes, strides)): - result_shape = (1,) * i + (size,) + (1,) * (len(sizes) - i - 1) - indexes = (jnp.arange(size) * stride).reshape(result_shape) - ind += indexes - - return jnp.ravel(x)[ind] - - -# aten.atan2 -@op(torch.ops.aten.atan2) -def _aten_atan2(self, other): - return jnp.arctan2(self, other) - - -# aten.bitwise_and -@op(torch.ops.aten.bitwise_and) -def _aten_bitwise_and(self, other): - return self & other - - -# aten.bitwise_or -@op(torch.ops.aten.bitwise_or) -def _aten_bitwise_or(self, other): - return self | other - - -# aten.bitwise_xor -@op(torch.ops.aten.bitwise_xor) -def _aten_bitwise_xor(self, other): - return self ^ other - - -# aten.clamp -@op(torch.ops.aten.clamp) -def _aten_clamp(self, min=None, max=None): - return jnp.clip(self, min, max) - - -# aten.constant_pad_nd -@op(torch.ops.aten.constant_pad_nd) -def _aten_constant_pad_nd(input, padding, value=0): - # NOTE: Torch padding is flat and reversed: (1, 1, 2, 2) - # means last dim get padded 1 in front and 1 in back; - # and second last dim get padded 2 in front and 2 in back. 
- # Jax padding tuple of 2-tuple: the same padding is - # [(0, 0), ..., (2,2), (1,1)] - m = len(padding) - rev_padding = [(padding[i - 1], padding[i]) for i in range(m - 1, 0, -2)] - pad_dim = tuple(([(0, 0)] * (len(input.shape) - m // 2)) + rev_padding) - return jnp.pad(input, pad_dim, mode="constant", constant_values=value) - - -# aten.convolution_backward -@op(torch.ops.aten.copy) -@op(torch.ops.aten.lift_fresh_copy) -def _aten_copy(x): - return jnp.copy(x) - - -@op(torch.ops.aten._cdist_forward) -def _aten_cdist_forward(x1, x2, p, compute_mode=''): - # x1 is B x P x M - # x2 is B x Q x M - # res is B x P x Q - x1 = jnp.expand_dims(x1, len(x1.shape) - 1) - x2 = jnp.expand_dims(x2, len(x2.shape) - 2) - return jnp.linalg.norm(x1 - x2, ord=p, axis=-1) - - -@op(torch.ops.aten._pdist_forward) -def _aten__pdist_forward(x, p): - pairwise_dists = _aten_cdist_forward(x, x, p) - condensed_dists = pairwise_dists[jnp.triu_indices( - pairwise_dists.shape[0], k=1)] - return condensed_dists - - -# aten.cos -@op(torch.ops.aten.cos) -def _aten_cos(input): - return jnp.cos(input) - - -# aten.cosh -@op(torch.ops.aten.cosh) -def _aten_cosh(input): - return jnp.cosh(input) - - -# aten.diagonal -@op(torch.ops.aten.diagonal) -def _aten_diagonal(input, offset=0, dim1=0, dim2=1): - return jnp.diagonal(input, offset, dim1, dim2) - - -# aten.empty_strided -# aten.eq -@op(torch.ops.aten.eq) -def _aten_eq(input1, input2): - return input1 == input2 - - -# aten.erf -@op(torch.ops.aten.erf) -def _aten_erf(x): - if x.dtype in (jnp.int32, jnp.int64): - x = x.astype(jnp.float32) - return jax.lax.erf(x) - - -# aten.exp -@op(torch.ops.aten.exp) -def _aten_exp(input): - return jnp.exp(input) - - -# aten.expm1 -@op(torch.ops.aten.expm1) -def _aten_expm1(input): - return jnp.expm1(input) - - -# aten.fill -@op(torch.ops.aten.fill) -@op(torch.ops.aten.full_like) -def _aten_fill(x, value, dtype=None, pin_memory=None, memory_format=None): - if dtype is None: - dtype = x.dtype - else: - dtype = tensor.t2j_dtype(dtype) - return jnp.full(x.shape, value, dtype) - - -# aten.flip -@op(torch.ops.aten.flip) -def _aten_flip(input, dims): - if dims is not None: - return jnp.flip(input, tuple(dims)) - else: - return jnp.flip(input) - - -# aten.floor -@op(torch.ops.aten.floor) -def _aten_floor(input): - return jnp.floor(input) - - -# aten.fmod -@op(torch.ops.aten.fmod) -def _aten_fmod(input, other): - return input - other * _aten_div(input, other, 'trunc') - - -# aten.gather -@op(torch.ops.aten.gather) -def _aten_gather(input, dim, index): - input_indexes, source_indexes = _scatter_index(dim, index) - return input[input_indexes] - - -# aten.ge -@op(torch.ops.aten.ge) -def _aten_ge(self, other): - return self >= other - - -@op(torch.ops.aten.glu) -@op(torch.ops.aten.glu.default) -def _aten_glu(x, dim=-1): - return jax.nn.glu(x, dim) - - -# aten.hardtanh -@op(torch.ops.aten.hardtanh) -def _aten_hardtanh(input, min_val=-1., max_val=1., inplace=False): - return jnp.clip(input, min_val, max_val) - - -# aten.isinf -@op(torch.ops.aten.isinf) -def _aten_isinf(input): - return jnp.isinf(input) - - -# aten.isnan -@op(torch.ops.aten.isnan) -def _aten_isnan(input): - return jnp.isnan(input) - - -@op(torch.ops.aten.le) -def _aten_le(self, other): - return self <= other - - -# aten.leaky_relu -@op(torch.ops.aten.leaky_relu) -def _aten_leaky_relu(x, negative_slope): - return jax.nn.leaky_relu(x, negative_slope) - - -# aten.log -@op(torch.ops.aten.log) -def _aten_log(x): - return jnp.log(x) - - -# aten.log10 -@op(torch.ops.aten.log10) -def _aten_log10(x): - 
return jnp.log10(x) - - -# aten.log1p -@op(torch.ops.aten.log1p) -def _aten_log1p(x): - return jnp.log1p(x) - - -# aten.log2 -@op(torch.ops.aten.log2) -def _aten_log2(x): - return jnp.log2(x) - - -# aten.logical_and -@op(torch.ops.aten.logical_and) -def _aten_logical_and(self, other): - return jnp.logical_and(self, other) - - -# aten.logical_or -@op(torch.ops.aten.logical_or) -def _aten_logical_or(self, other): - return jnp.logical_or(self, other) - - -# aten.logical_not -@op(torch.ops.aten.logical_not) -def _aten_logical_not(self): - return jnp.logical_not(self) - - -# aten.log_softmax -@op(torch.ops.aten._log_softmax) -def _aten_log_softmax(self, axis=-1, half_to_float=False): - return jax.nn.log_softmax(self, axis) - - -# aten.max_pool3d_backward -# aten.logical_xor -@op(torch.ops.aten.logical_xor) -def _aten_logical_xor(self, other): - return jnp.logical_xor(self, other) - - -# aten.max_pool2d_with_indices_backward -# aten.native_dropout -# aten.native_group_norm_backward -# aten.neg -@op(torch.ops.aten.neg) -def _aten_neg(x): - return -1 * x - - -# aten.nonzero -@op(torch.ops.aten.nonzero) -def _aten_nonzero(x): - index_tuple = jnp.nonzero(x) - index_tuple = [jnp.expand_dims(p, -1) for p in index_tuple] - return jnp.concatenate(index_tuple, axis=-1) - - -# aten.prod - - -@op(torch.ops.aten.prod) -def _aten_prod(self, dim=None, keepdim=False): - return jnp.prod(self, axis=dim, keepdims=keepdim) - - -# aten.rand -# aten.randn -# aten.randperm -# aten.reflection_pad3d -# aten.remainder -@op(torch.ops.aten.remainder) -def _aten_remainder(inputs, other): - return inputs % other - - -# aten.repeat -@op(torch.ops.aten.repeat) -def _aten_repeat(x, reps): - return jnp.tile(x, reps) - - -# aten.replication_pad2d -# aten.replication_pad3d -# aten.roll -@op(torch.ops.aten.roll) -def _aten_roll(input, shifts, dims=None): - return jnp.roll(input, shifts, dims) - - -# aten.scalar_tensor -# aten.slice_scatter -@op(torch.ops.aten.slice_scatter) -def _aten_slice_scatter(input, src, dim=0, start=None, end=None, step=1): - input_index = [] - for x in range(len(input.shape)): - if x == dim: - input_index.append(slice(start, end, step)) - else: - input_index.append(slice(None, None, None)) - return input.at[tuple(input_index)].set(src) - - -# aten.sort -# torch.sort(input, dim=-1, descending=False, stable=False, *, out=None) -@op(torch.ops.aten.sort) -def _aten_sort(a, dim=-1, descending=False, stable=False): - return ( - jnp.sort(a, axis=dim, stable=stable, descending=descending), - jnp.argsort(a, axis=dim, stable=stable, descending=descending), - ) - - -# aten.sym_size - - -# aten.topk -@op(torch.ops.aten.topk) -def _aten_topk(input, k, dim=None, largest=True, sorted=True, *, out=None): - """JAX top-k implementation using jax.lax.top_k for improved efficiency. - - Args: - input: The input JAX array. - k: The number of top elements to return. - dim: The dimension along which to find the top-k. If None, operates on the - flattened array. - largest: If True, returns the largest k elements. Otherwise, smallest k. - sorted: If True, returns the elements in sorted order. - - Returns: - A tuple (values, indices) containing: - - values: The top k values. - - indices: The indices of the top k values in the original array. 
- """ - if dim is None: - input = input.flatten() - dim = 0 - - if not largest: - input = -input # Find top-k of negated input if we want the smallest - - transpose_shape = None - if dim != -1 and dim != len(input.shape) - 1: - transpose_shape = list(range(len(input.shape))) - transpose_shape[dim], transpose_shape[-1] = (transpose_shape[-1], - transpose_shape[dim]) - input = jnp.transpose(input, transpose_shape) - - values, indices = jax.lax.top_k(input, k) - - if sorted: - values = jnp.sort(values, descending=True) - indices = jnp.take_along_axis( - indices, jnp.argsort(values, axis=-1, descending=True), axis=-1) - - if not largest: - values = -values # Negate values back if we found smallest - - if transpose_shape is not None: - values = jnp.transpose(values, transpose_shape) - indices = jnp.transpose(indices, transpose_shape) - - return values, indices - - -# aten.trunc -@op(torch.ops.aten.trunc) -def _aten_trunc(a): - return jnp.trunc(a) - - -@op(torch.ops.aten.unbind) -@op(torch.ops.aten.unbind_copy) -def _aten_unbind(a, dim=0): - return tuple( - _aten_squeeze_dim(jax.lax.index_in_dim(a, i, axis=dim), dim) - for i in range(a.shape[dim])) - - -# NOTE: skip aten.upsample_nearest2d and aten.upsample_bilinear2d -# despite those being core aten ops, they also have decompositions. -# here we are using torch decompositions. - - -# aten.where -@op(torch.ops.aten.where) -def _aten_where(condition, x, y): - return jnp.where(condition, x, y) - - -# aten.to.dtype -#Tensor(a) self, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None -@op(torch.ops.aten.to.dtype) -def _aten_to_dtype(a, - dtype, - non_blocking=False, - copy=False, - memory_format=None): - jaxdtype = tensor.t2j_dtype(dtype) - return a.astype(jaxdtype) - - -# aten.to.device - - -#Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False -@op(torch.ops.aten.var_mean.correction) -def _aten_var_mean_correction(self, dim=None, correction=None, keepdim=False): - return (jnp.var(self, axis=dim, ddof=correction, - keepdims=keepdim), jnp.mean(self, dim, keepdims=keepdim)) - - -@op(torch.ops.aten.scalar_tensor) -def _aten_scalar_tensor(s, - dtype=None, - layout=None, - device=None, - pin_memory=None): - if dtype is not None: - dtype = tensor.t2j_dtype(dtype) - return jnp.array(s, dtype=dtype) - return jnp.array(s) - - -@op(torch.ops.aten.to.device) -def _aten_to_device(x,device, dtype): - return x - - -@op(torch.ops.aten.max_pool2d_with_indices_backward) -def max_pool2d_with_indices_backward_custom(grad_output, self, kernel_size, stride, padding, dilation, ceil_mode, indices): - - """ - Approximates the gradient calculation of PyTorch's max_pool2d_with_indices_backward. - - Args: - grad_output: The gradient tensor from the preceding layer. - self: The input tensor on which the original max pooling was performed. - kernel_size: The size of the pooling window. - stride: The stride of the pooling window. - padding: The padding applied during max pooling. - dilation: The dilation factor for the pooling operation. - ceil_mode: Whether to use ceil or floor when calculating output shapes. - indices: The indices of the maximum values, as produced by max_pool2d_with_indices. - - Returns: - The calculated gradient with respect to the input (grad_input). 
- """ - - kH, kW = kernel_size - dH, dW = stride - padH, padW = padding - dilH, dilW = dilation - - # Calculate output shape (may need adjustment based on ceil_mode) - out_shape = jnp.array(self.shape) - grad_input = jnp.zeros_like(self) - - # Iterate over the flattened input and output tensors - for i, idx in enumerate(indices.flatten()): - # Calculate input coordinates corresponding to the maximum value - out_y, out_x = i // grad_output.shape[3], i % grad_output.shape[3] - in_y = out_y * dH - padH + out_y * (dilH - 1) - in_x = out_x * dW - padW + out_x * (dilW - 1) - - # Scatter the gradient to the appropriate input locations (handling potential overlaps) - for y in range(in_y, in_y + kH): - for x in range(in_x, in_x + kW): - if 0 <= y < grad_input.shape[2] and 0 <= x < grad_input.shape[3]: - grad_input = grad_input.at[y, x].add(grad_output.flatten()[i]) - - return grad_input - - -@op(torch.ops.aten._local_scalar_dense) -def _aten_local_scalar_dense(x): - return x.item() - -@op(torch.ops.aten.tensor_split.sections) -def _aten_tensor_split(ary, indices_or_sections, axis=0): - return jnp.array_split(ary, indices_or_sections, axis) - -@op(torch.ops.aten.outer) -def _aten_outer(a, b): - return jnp.outer(a, b) - -@op(torch.ops.aten.allclose) -def _aten_allclose(input, other, rtol=1e-05, atol=1e-08, equal_nan=False): - return jnp.allclose(input, other, rtol, atol, equal_nan) \ No newline at end of file diff --git a/experimental/torch_xla2/torch_xla2/decompositions.py b/experimental/torch_xla2/torch_xla2/decompositions.py index e85e49e13ee..81b48bb5da8 100644 --- a/experimental/torch_xla2/torch_xla2/decompositions.py +++ b/experimental/torch_xla2/torch_xla2/decompositions.py @@ -90,4 +90,21 @@ def _reflection_or_replication_pad( return result _try_register(aten.replication_pad1d, _replication_pad) -_try_register(aten.replication_pad3d, _replication_pad) \ No newline at end of file +_try_register(aten.replication_pad3d, _replication_pad) + +EXTRA_DECOMP = decomp.get_decompositions([ + torch.ops.aten.upsample_nearest2d, + torch.ops.aten._native_batch_norm_legit.no_stats, + torch.ops.aten._adaptive_avg_pool2d, + torch.ops.aten._adaptive_avg_pool3d, + torch.ops.aten.grid_sampler_2d, + torch.ops.aten.native_dropout, + torch.ops.aten.reflection_pad1d, + torch.ops.aten.reflection_pad2d, + torch.ops.aten.reflection_pad3d, + torch.ops.aten.replication_pad1d, + torch.ops.aten.replication_pad2d, + torch.ops.aten.replication_pad3d, +]) + +EXTRA_DECOMP[torch.ops.aten.uniform] = torch.ops.aten.rand \ No newline at end of file diff --git a/experimental/torch_xla2/torch_xla2/environment.py b/experimental/torch_xla2/torch_xla2/environment.py index 6a71c7d51c0..139597f9cb0 100644 --- a/experimental/torch_xla2/torch_xla2/environment.py +++ b/experimental/torch_xla2/torch_xla2/environment.py @@ -1,26 +1,2 @@ -import jax - - -class Environment: - """This class holds a set of configurations and "globals" needed - - for executing torch program using jax. - Things included so far: - - op registry - PRNGKey - Configs - - Also helper functions to manipulate those. 
- """ - - _prng_key: jax.random.PRNGKey - - - def __init__(self, random_seed): - self._prng_key = jax.random.PRNGKey(random_seed) - - def get_and_rotate_prng_key(self): - self._prng_key, key = jax.random.split(self._prng_key) diff --git a/experimental/torch_xla2/torch_xla2/export.py b/experimental/torch_xla2/torch_xla2/export.py index 64a3f9d175c..78430a6d537 100644 --- a/experimental/torch_xla2/torch_xla2/export.py +++ b/experimental/torch_xla2/torch_xla2/export.py @@ -2,146 +2,12 @@ """Utilities for exporting a torch program to jax/stablehlo.""" import copy from typing import Any, Dict, Tuple -import jax import torch -from torch.fx import _pytree as fx_pytree -from torch_xla2 import ops_registry, tensor +from torch_xla2.ops import ops_registry +from torch_xla2 import tensor from torch.utils import _pytree as pytree -class JaxProgram: - - def _wrap_inputs(self, xs, allow_torch_tensor=False): - - def convert(t): - if isinstance(t, tensor.XLATensor2): - return t - if isinstance(t, torch.Tensor): - if allow_torch_tensor: - return tensor.move_to_device(t) - else: - raise ValueError('Regular torch.Tensor is not allowed.') - if isinstance(t, jax.Array): - return tensor.XLATensor2(t) - return t - - return jax.tree_util.tree_map(convert, xs) - - def _unwrap_outputs(self, xs): - - def convert(t): - if isinstance(t, tensor.XLATensor2): - return t.jax() - if isinstance(t, torch.Tensor): - raise ValueError('Regular torch.Tensor is not allowed.') - return t - - return jax.tree_util.tree_map(convert, xs) - - def __init__( - self, - exported_program, - param_buffer_values, - ordered_tensor_constants, - ): - - self.param_buffer_values = self._wrap_inputs( - param_buffer_values, allow_torch_tensor=True) - self.ordered_tensor_constants = self._wrap_inputs( - ordered_tensor_constants, allow_torch_tensor=True) - self.exported_program = exported_program - - def __hash__(self): - return hash(self.exported_program) - - @property - def example_inputs(self): - args, kwargs = self.exported_program.example_inputs - args = pytree.tree_map(tensor.t2j, args) - kwargs = pytree.tree_map(tensor.t2j, kwargs) - return args, kwargs - - def flatten_inputs(self, args, kwargs): - if args is None: - args = tuple() - if kwargs is None: - kwargs = {} - - if (in_spec := self.exported_program.call_spec.in_spec) is not None: - if (in_spec.type == tuple and len(in_spec.children_specs) == 2 and - in_spec.children_specs[0].type == tuple and - in_spec.children_specs[1].type == dict): - # NOTE: this is the case where in_spec is for both args and kwargs - return fx_pytree.tree_flatten_spec((args, kwargs), in_spec) - return fx_pytree.tree_flatten_spec(args, in_spec) - return copy.deepcopy(args) - - def unflatten_outputs(self, res): - return pytree.tree_unflatten(res, self.exported_program.call_spec.out_spec) - - def __call__(self, *args, **kwargs): - - inputs = self.flatten_inputs(args, kwargs) - res = self.flatten_callable(*inputs) - res = self.unflatten_outputs(res) - - return res - - @property - def flatten_callable(self): - - def func(*inputs: jax.Array): - nonlocal self - inputs = self._wrap_inputs(inputs) - num_mutations = len( - self.exported_program.graph_signature.buffers_to_mutate) - res = torch.fx.Interpreter(self.exported_program.graph_module).run( - *self.param_buffer_values, - *inputs, - *self.ordered_tensor_constants, - enable_io_processing=False, - ) - res = res[num_mutations:] - res = self._unwrap_outputs(res) - return res - - return func - - def jit(self, *args, **kwargs): - """Returns `jax.jit(self, *args, **kwargs)`.""" 
- return jax.jit(self, *args, **kwargs) - - def jit_lower(self, *args, **kwargs): - """Returns `jax.jit(self, *args, **kwargs).lower(...)` with example_inputs used in export.""" - example_args, example_kwargs = self.example_inputs - return self.jit(*args, **kwargs).lower(*example_args, **example_kwargs) - - -def exported_program_to_jax_program(ep): - """exported_program_to_jax_program. - - Args: - ep: torch.export.ExportedProgram - - Returns: - JaxProgram - - """ - if torch.__version__ >= '2.2': - ep = ep.run_decompositions() - - param_buffer_keys = ep.graph_signature.parameters + ep.graph_signature.buffers - param_buffer_values = tuple(ep.state_dict[key] for key in param_buffer_keys) - - if hasattr(ep.graph_signature, 'lifted_tensor_constants'): - ordered_tensor_constants = tuple( - ep.tensor_constants[name] - for name in ep.graph_signature.lifted_tensor_constants) - else: - ordered_tensor_constants = tuple() - - return JaxProgram(ep, param_buffer_values, ordered_tensor_constants) - DEBUG = False @@ -149,6 +15,11 @@ def exported_program_to_jax_program(ep): class JaxInterpreter(torch.fx.Interpreter): """Experimental.""" + def __init__(self, graph_module): + super().__init__(graph_module) + import torch_xla2.ops.jaten + import torch_xla2.ops.jtorch + def call_function(self, target, args: Tuple, kwargs: Dict) -> Any: if not isinstance(target, (torch._ops.OpOverloadPacket, torch._ops.OpOverload)): @@ -157,7 +28,9 @@ def call_function(self, target, args: Tuple, kwargs: Dict) -> Any: if DEBUG: print('Running ', target.name(), '--------') - op = ops_registry.lowerings.lookup(target) + op = ops_registry.all_aten_ops.get(target) + if op is None: + op = ops_registry.all_aten_ops.get(target.overloadpacket) if op is None: print(target.name(), target.tags) raise RuntimeError('No lowering found for', target.name()) diff --git a/experimental/torch_xla2/torch_xla2/extra.py b/experimental/torch_xla2/torch_xla2/extra.py deleted file mode 100644 index ebfdb96b1db..00000000000 --- a/experimental/torch_xla2/torch_xla2/extra.py +++ /dev/null @@ -1,62 +0,0 @@ -import jax -import jax.numpy as jnp -import functools -import torch -from torch.utils import _pytree as pytree -from torch_xla2 import tensor - -def torch_view(t): - # t is an object from jax land - # view it as-if it's a torch land object - if isinstance(t, jax.Array): - return tensor.XLATensor2(t) - if isinstance(t, type(jnp.int32)): - return tensor.t2j_type(t) - if callable(t): - def new_t(*args, **kwargs): - # args, kwargs are torch-land - args, kwargs = pytree.tree_map(jax_view, (args, kwargs)) - # now they are objs in jax-land - res = t(*args, **kwargs) # t is jax callable - # res is jax-land obj - return pytree.tree_map(torch_view, res) - return new_t - # regular types are not changed - return t - - -def jax_view(t): - # t is an object from torch land - # view it as-if it's a jax land object - if isinstance(t, torch.Tensor): - assert isinstance(t, tensor.XLATensor2) - return t.jax() - if isinstance(t, type(torch.int32)): - return tensor.j2t_dtype(t) - if callable(t): - def new_t(*args, **kwargs): - # args, kwargs are jax-land - args, kwargs = pytree.tree_map(torch_view, (args, kwargs)) - # now they are objs in torch-land - res = t(*args, **kwargs) - # res is torch-land obj - return pytree.tree_map(jax_view, res) - return new_t - # regular types are not changed - return t - -def call_jax(jax_func, *args, **kwargs): - return torch_view(jax_func)(*args, **kwargs) - - -def call_torch(torch_func, *args, **kwargs): - return jax_view(torch_func)(*args, 
**kwargs) - - -fori_loop = torch_view(jax.lax.fori_loop) - -def jax_jit(torch_function, kwargs_for_jax_jit=None): - kwargs_for_jax_jit = kwargs_for_jax_jit or {} - jax_func = jax_view(torch_function) - jitted = jax.jit(jax_func, **kwargs_for_jax_jit) - return torch_view(jitted) diff --git a/experimental/torch_xla2/torch_xla2/functions.py b/experimental/torch_xla2/torch_xla2/functions.py deleted file mode 100644 index 94320fd7cb2..00000000000 --- a/experimental/torch_xla2/torch_xla2/functions.py +++ /dev/null @@ -1,135 +0,0 @@ -"""Tensor constructor overrides""" -import functools -import logging -from typing import Callable, Optional, ParamSpec, Sequence - -import jax -import torch -import jax.numpy as jnp -from torch_xla2 import tensor - -registry = {} - -P = ParamSpec('P') - - -def register_function(torch_func: Callable[P, torch.Tensor]): - """Registers a function as the JAX implementation of a torch function.""" - - def decorator(jax_impl: Callable[P, jax.Array]): - registry[torch_func] = jax_impl - return jax_impl - - return decorator - - -def convert_dtype(use_default_dtype: bool = True): - """Converts `dtype` kwarg of function from torch to JAX. - - Args: - use_default_dtype: Whether to use torch default dtype if none is provided. - - Returns: - A decorator that wraps a JAX implementation of a torch function. - """ - - def decorator(func: Callable[P, torch.Tensor]): - - @functools.wraps(func) - def wrapper(*args: P.args, - dtype: Optional[torch.dtype] = None, - **kwargs: P.kwargs): - if not dtype and use_default_dtype: - dtype = torch.get_default_dtype() - jax_dtype = tensor.t2j_dtype(dtype) - - return func(*args, dtype=jax_dtype, **kwargs) - - return wrapper - - return decorator - - -@register_function(torch.tensor) -@convert_dtype(use_default_dtype=False) # Attempt to infer type from elements -def _tensor(data, *, dtype=None, **kwargs): - python_types_to_torch_types = { - bool: jnp.bool, - int: jnp.int64, - float: jnp.float32, - complex: jnp.complex64, - } - if not dtype: - leaves = jax.tree_util.tree_leaves(data) - if len(leaves) > 0: - dtype = python_types_to_torch_types.get(type(leaves[0])) - - return jnp.array( - data, dtype=dtype or tensor.t2j_dtype(torch.get_default_dtype())) - - -@register_function(torch.ones) -@convert_dtype() -def _ones(*size: int, dtype=None, **kwargs): - return jnp.ones(size, dtype) - - -@register_function(torch.zeros) -@convert_dtype() -def _zeros(*size: int, dtype=None, **kwargs): - return jnp.zeros(size, dtype) - - -@register_function(torch.eye) -@convert_dtype() -def _eye(n: int, m: Optional[int] = None, *, dtype=None, **kwargs): - return jnp.eye(n, m, dtype=dtype) - - -@register_function(torch.full) -@convert_dtype() -def _full(size: Sequence[int], fill_value, *, dtype=None, **kwargs): - # TODO: handle torch.Size - return jnp.full(size, fill_value, dtype=dtype) - -@register_function(torch.allclose) -def _aten_allclose(input, other, rtol=1e-05, atol=1e-08, equal_nan=False): - return jnp.allclose(input, other, rtol, atol, equal_nan) - -@register_function(torch.angle) -def _torch_angle(input): - return jnp.angle(input) - - -@register_function(torch.argsort) -def _torch_argsort(input, dim=-1, descending=False, stable=False): - expanded = False - if input == 0: - # for self of rank 0: - # torch.any(x, 0), torch.any(x, -1) works; - # torch.any(x, 1) throws out of bounds, so it's - # behavior is the same as a jnp array of rank 1 - expanded = True - input = jnp.expand_dims(input, 0) - res = jnp.argsort(input, axis=dim, descending=descending, - stable=stable) 
- if expanded: - res = res.squeeze() - return res - - - -class XLAFunctionMode(torch.overrides.TorchFunctionMode): - """Context manager that dispatches torch function calls to JAX.""" - - def __torch_function__(self, - func, - types, - args=(), - kwargs=None) -> torch.Tensor: - jax_func = registry.get(func) - if not jax_func: - return func(*args, **(kwargs or {})) - - # TODO: unwrap args here or in implementations? - return tensor.wrap(jax_func(*args, **(kwargs or {}))) diff --git a/experimental/torch_xla2/torch_xla2/interop.py b/experimental/torch_xla2/torch_xla2/interop.py new file mode 100644 index 00000000000..fbcd47922e1 --- /dev/null +++ b/experimental/torch_xla2/torch_xla2/interop.py @@ -0,0 +1,65 @@ +import functools +import torch +import jax +import jax.numpy as jnp +from jax import tree_util as pytree +from torch_xla2 import tensor +import torch_xla2 + +from torch_xla2.types import JaxValue, TorchValue, JaxCallable, TorchCallable + + + + +def torch_view(t: JaxValue) -> TorchValue: + # t is an object from jax land + # view it as-if it's a torch land object + if isinstance(t, jax.Array): + # TODO + return tensor.XLATensor2(t, torch_xla2.default_env()) + if isinstance(t, type(jnp.int32)): + return tensor.t2j_type(t) + if callable(t): # t is a JaxCallable + return functools.partial(call_jax, t) + # regular types are not changed + return t + + +def jax_view(t: TorchValue) -> JaxValue: + # t is an object from torch land + # view it as-if it's a jax land object + if isinstance(t, torch.Tensor): + assert isinstance(t, tensor.XLATensor2) + return t.jax() + if isinstance(t, type(torch.int32)): + return tensor.j2t_dtype(t) + + # torch.nn.Module needs special handling + if not isinstance(t, torch.nn.Module) and callable(t): # t is a TorchCallable + return functools.partial(call_torch, t) + # regular types are not changed + return t + + +def call_jax(jax_func: JaxCallable, + *args: TorchValue, + **kwargs: TorchValue) -> TorchValue: + args, kwargs = pytree.tree_map(jax_view, (args, kwargs)) + res: JaxValue = jax_func(*args, **kwargs) + return torch_view(res) + + +def call_torch(torch_func: TorchCallable, *args: JaxValue, **kwargs: JaxValue) -> JaxValue: + args, kwargs = pytree.tree_map(torch_view, (args, kwargs)) + with torch_xla2.default_env(): + res: TorchValue = torch_func(*args, **kwargs) + return jax_view(res) + + +fori_loop = torch_view(jax.lax.fori_loop) + +def jax_jit(torch_function, kwargs_for_jax_jit=None): + kwargs_for_jax_jit = kwargs_for_jax_jit or {} + jax_func = jax_view(torch_function) + jitted = jax.jit(jax_func, **kwargs_for_jax_jit) + return torch_view(jitted) diff --git a/experimental/torch_xla2/torch_xla2/ops/__init__.py b/experimental/torch_xla2/torch_xla2/ops/__init__.py index e69de29bb2d..abefc8344b1 100644 --- a/experimental/torch_xla2/torch_xla2/ops/__init__.py +++ b/experimental/torch_xla2/torch_xla2/ops/__init__.py @@ -0,0 +1,9 @@ +def all_aten_jax_ops(): + # to load the ops + import torch_xla2.jaten # type: ignore + import torch_xla2.ops_registry # type: ignore + return { + key: val.func + for key, val in torch_xla2.ops_registry.all_aten_ops + if val.is_jax_function + } \ No newline at end of file diff --git a/experimental/torch_xla2/torch_xla2/ops/jaten.py b/experimental/torch_xla2/torch_xla2/ops/jaten.py index a30fae82de8..f6adc702a14 100644 --- a/experimental/torch_xla2/torch_xla2/ops/jaten.py +++ b/experimental/torch_xla2/torch_xla2/ops/jaten.py @@ -1,5 +1,14 @@ -"""This module contains implementation of ATen ops.""" +"""Torch ops implemented using jax.""" + 
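+# NOTE (editor sketch, not part of the original change): the lowerings in this
+# module take and return jax arrays. Torch-facing code reaches them either
+# through the dispatch mode of `torch_xla2.default_env()` or through the
+# wrappers in `torch_xla2.interop` (call_torch, jax_jit) introduced above.
+# A minimal, hypothetical round trip might look like:
+#
+#   import torch
+#   import torch_xla2
+#   from torch_xla2.interop import jax_jit
+#
+#   env = torch_xla2.default_env()
+#   x = env.to_xla(torch.randn(4, 4))
+#
+#   @jax_jit
+#   def double(t):
+#     return t + t   # eventually routed to _aten_add below and run with jnp
+#
+#   y = double(x)
+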
+import sys + +import jax +from jax import numpy as jnp +import numpy as np import torch +from torch_xla2.ops import ops_registry +from torch_xla2 import tensor +from torch_xla2.ops import op_base # Keys are OpOverload, value is a callable that takes # XLATensor2 @@ -9,29 +18,1933 @@ # and need to be implemented in jax mutation_ops_to_functional = { - torch.ops.aten.add_: torch.ops.aten.add, - torch.ops.aten.sub_: torch.ops.aten.sub, - torch.ops.aten.mul_: torch.ops.aten.mul, - torch.ops.aten.div_: torch.ops.aten.div, - torch.ops.aten.pow_: torch.ops.aten.pow, - torch.ops.aten.lt_: torch.ops.aten.lt, - torch.ops.aten.le_: torch.ops.aten.le, - torch.ops.aten.gt_: torch.ops.aten.gt, - torch.ops.aten.ge_: torch.ops.aten.ge, - torch.ops.aten.eq_: torch.ops.aten.eq, - torch.ops.aten.ne_: torch.ops.aten.ne, + torch.ops.aten.add_: torch.ops.aten.add, + torch.ops.aten.sub_: torch.ops.aten.sub, + torch.ops.aten.mul_: torch.ops.aten.mul, + torch.ops.aten.div_: torch.ops.aten.div, + torch.ops.aten.pow_: torch.ops.aten.pow, + torch.ops.aten.lt_: torch.ops.aten.lt, + torch.ops.aten.le_: torch.ops.aten.le, + torch.ops.aten.gt_: torch.ops.aten.gt, + torch.ops.aten.ge_: torch.ops.aten.ge, + torch.ops.aten.eq_: torch.ops.aten.eq, + torch.ops.aten.ne_: torch.ops.aten.ne, + torch.ops.aten.uniform_: torch.ops.aten.uniform, } def make_mutation(op): + return op_base.InplaceOp(mutation_ops_to_functional[op], position_to_mutate=0) - def f(*args, **kwargs): - res = mutation_ops_to_functional[op](*args, **kwargs) - args[0].copy_(res) - return args[0] - return f +for op in mutation_ops_to_functional.keys(): + ops_registry.register_torch_dispatch_op( + op, make_mutation(op), is_jax_function=False + ) -for op in mutation_ops_to_functional.keys(): - all_ops[op] = make_mutation(op) +def op(*aten, **kwargs): + def inner(func): + for a in aten: + ops_registry.register_torch_dispatch_op(a, func, **kwargs) + return func + + return inner + + +@op( + torch.ops.aten.view_copy, + torch.ops.aten.view, + torch.ops.aten._unsafe_view, + torch.ops.aten.reshape, +) +def _aten_unsafe_view(x, shape): + return jnp.reshape(x, shape) + + +@op(torch.ops.aten.add.Tensor) +@op(torch.ops.aten.add.Scalar) +def _aten_add(x, y, *, alpha=1): + """if isinstance(x, jnp.ndarray) and isinstance(y, jnp.ndarray): + + assert x.dtype == y.dtype, (x.dtype, y.dtype) + """ + return x + y * alpha + + +@op(torch.ops.aten.copy_, torch.ops.aten.copy_.default, is_jax_function=False) +def _aten_copy(x, y, memory_format=None): + if isinstance(x, tensor.XLATensor2): + x._elem = y._elem + elif isinstance(x, tensor.SliceView): + x.mutate(y) + return x + + +@op(torch.ops.aten.clone) +@op(torch.ops.aten.clone.default) +def _aten_clone(x, memory_format=None): + return jnp.copy(x) + + +@op(torch.ops.aten.full) +def _aten_full(size, value, **kwargs): + return jnp.full(size, value) + + +@op(torch.ops.aten.index_copy) +def _aten_index_copy(x, dim, indexes, source): + # return jax.lax.scatter(x, index, dim) + dims = [] + for i in range(len(x.shape)): + if i == dim: + dims.append(indexes) + else: + dims.append(slice(None, None, None)) + return x.at[dim].set(source) + + +@op(torch.ops.aten.select) +@op(torch.ops.aten.index_select) +@op(torch.ops.aten.select_copy) +def _aten_index_select(x, dim, indexes): + dims = [] + for i in range(len(x.shape)): + if i == dim: + dims.append(indexes) + else: + dims.append(slice(None, None, None)) + return x[tuple(dims)] + + +@op(torch.ops.aten.mean) +def _aten_mean(x, dim=None, keepdim=False): + return jnp.mean(x, dim, keepdims=keepdim) + + 
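+# NOTE (editor sketch, not part of the original change): registering a new
+# lowering only requires decorating a jax implementation with `op`. A
+# hypothetical lowering for aten.clamp_min could look like:
+#
+#   @op(torch.ops.aten.clamp_min)
+#   def _aten_clamp_min(x, min=None):
+#     return jnp.maximum(x, min)
+#
+# Pass `is_jax_function=False` when the implementation expects XLATensor2
+# objects instead of raw jax arrays, as the aten.copy_ lowering above does.
+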
+def _torch_binary_scalar_type(scalar, tensor): + if "float" in str(tensor.dtype): + return tensor.dtype + + if isinstance(scalar, int): + if "int" in str(tensor.dtype): + return tensor.dtype + + return jnp.float32 + + +@op(torch.ops.aten.sub.Tensor) +@op(torch.ops.aten.sub.Scalar) +def _aten_sub(x, y): + if isinstance(x, float): + dtype = _torch_binary_scalar_type(x, y) + x = jnp.array(x, dtype=dtype) + if isinstance(y, float): + dtype = _torch_binary_scalar_type(y, x) + y = jnp.array(y, dtype=dtype) + return x - y + + +@op(torch.ops.aten.mm) +def _aten_mm(x, y): + res = x @ y + return res + + +@op(torch.ops.aten.mul.Tensor, torch.ops.aten.mul.Scalar) +def _aten_mul(x, y): + return x * y + + +@op(torch.ops.aten.silu) +def _aten_silu(x): + return jax.nn.silu(x) + + +@op(torch.ops.aten.t) +def _aten_t(x): + return jnp.transpose(x) + + +@op(torch.ops.aten.transpose) +@op(torch.ops.aten.transpose_copy) +def _aten_transpose(x, dim0, dim1): + shape = list(range(len(x.shape))) + shape[dim0], shape[dim1] = shape[dim1], shape[dim0] + return jnp.transpose(x, shape) + + +@op(torch.ops.aten.triu) +def _aten_triu(m, k): + return jnp.triu(m, k) + + +@op(torch.ops.aten.slice) +@op(torch.ops.aten.slice_copy) +def _aten_slice(self, dim=0, start=None, end=None, step=1): + if end == sys.maxsize: + end = self.shape[dim] + sl = slice(start, end, step) + dims = [] + for i in range(len(self.shape)): + if i == dim: + dims.append(sl) + else: + dims.append(slice(None, None, None)) + return self[tuple(dims)] + + +@op(torch.ops.aten.detach) +def _aten_detach(self): + return self + + +@op(torch.ops.aten.view_as_real) +def _aten_view_as_real(x): + real = jnp.real(x) + im = jnp.imag(x) + res = jnp.stack([real, im], -1) + return res + + +@op(torch.ops.aten.stack) +def _aten_stack(tensors, dim=0): + return jnp.stack(tensors, dim) + + +@op(torch.ops.aten._softmax) +def _aten_softmax(x, dim, halftofloat): + return jax.nn.softmax(x, dim) + + +@op(torch.ops.aten.pow) +def _aten_pow(x, y): + if isinstance(y, int): + y = float(y) + return jnp.power(x, y) + + +@op(torch.ops.aten.view_as_complex) +def _aten_view_as_complex(input): + if input.dtype == jnp.bfloat16: + input = input.astype(jnp.float32) + x, y = input[..., 0], input[..., 1] + return jax.lax.complex(x, y) + + +@op(torch.ops.aten.div) +def _aten_div(x, y, rounding_mode=""): + res = x / y + if rounding_mode == "trunc": + res = jnp.trunc(res) + return res + + +@op(torch.ops.aten.div_, is_jax_function=False) +def _aten_div_(x, y, rounding_mode=""): + x._elem = _aten_div(x._elem, y._elem, rounding_mode) + return x + + +@op(torch.ops.aten.true_divide) +def _aten_true_divide(x, y): + return x / y + + +@op(torch.ops.aten.bmm) +def _aten_bmm(x, y): + res = x @ y + return res + # return jnp.einsum('bnm,bmk->bnk', x, y) + + +@op(torch.ops.aten.embedding) +# embedding(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) +def _aten_embedding(a, w, padding_idx=-1): + return jnp.take(a, w, axis=0) + + +@op(torch.ops.aten.rsqrt) +def _aten_rsqrt(x): + if isinstance(x, int): + x = float(x) + if x.dtype == jnp.int32: + x = x.astype(jnp.float32) + return jax.lax.rsqrt(x) + + +@op(torch.ops.aten.expand) +@op(torch.ops.aten.expand_copy) +def _aten_expand(x, dims): + def fix_dims(d, xs): + if d == -1: + return xs + return d + + dims = [fix_dims(p, s) for p, s in zip(dims, x.shape)] + return jnp.broadcast_to(x, dims) + + +@op(torch.ops.aten.dot) +def _aten_dot(x, y): + return jnp.dot(x, y) + + +@op(torch.ops.aten._to_copy) +def 
_aten__to_copy(self, **kwargs): + dtype = tensor.t2j_dtype(kwargs["dtype"]) + if dtype != self.dtype: + return self.astype(dtype) + return jnp.copy(self) + + +@op(torch.ops.aten.empty) +def _aten_empty(sizes, **kwargs): + return jnp.zeros(sizes) + + +@op(torch.ops.aten.index_put_) +@op(torch.ops.aten.index_put) +def _aten_index_put(self, indexes, values, accumulate=False): + indexes = [slice(None, None, None) if i is None else i for i in indexes] + indexes = tuple(indexes) + if accumulate: + return self.at[indexes].add(values) + else: + return self.at[indexes].set(values) + + +@op(torch.ops.aten.index) +@op(torch.ops.aten._unsafe_index) +@op(torch.ops.aten.index.Tensor) +def _aten_index(self, indexes): + print(indexes) + indexes = [slice(None, None, None) if i is None else i for i in indexes] + indexes = tuple(indexes) + return self[indexes] + + +@op(torch.ops.aten.split) +@op(torch.ops.aten.split_copy) +@op(torch.ops.aten.split_with_sizes) +def split_with_sizes(x, sizes, dim=0): + """Splits an array `x` into sub-arrays based on static sizes `sizes`. + + Args: + x: The input array to split. + sizes: A 1D array of integer sizes for each sub-array. + + Returns: + A list of sub-arrays. + """ + if isinstance(sizes, int): + # split equal size + new_sizes = [sizes] * (x.shape[dim] // sizes) + sizes = new_sizes + rank = x.ndim + splits = np.cumsum(sizes) # Cumulative sum for split points + + def make_range(rank, dim, start, end): + res = [slice(None, None, None)] * rank + res[dim] = slice(start, end) + return tuple(res) + + return [ + x[make_range(rank, dim, start, end)] + for start, end in zip([0] + list(splits[:-1]), splits) + ] + + +@op(torch.ops.aten.permute) +@op(torch.ops.aten.permute_copy) +def permute(t, dims): + return jnp.transpose(t, dims) + + +@op(torch.ops.aten.unsqueeze) +@op(torch.ops.aten.unsqueeze_copy) +@op(torch.ops.aten.unsqueeze.default) +def _aten_unsqueeze(self, dim): + if dim < 0: + dim += self.ndim + 1 + return jnp.expand_dims(self, dim) + + +@op(torch.ops.aten.ne) +def _aten_ne(x, y): + return jnp.not_equal(x, y) + + +@op(torch.ops.aten.cumsum) +def _aten_cumsum(x, y, dtype=None): + if dtype: + dtype = tensor.t2j_dtype(dtype) + res = jnp.cumsum(x, y, dtype) + return res + + +@op(torch.ops.aten.native_layer_norm) +def _aten_native_layer_norm( + input, normalized_shape, weight=None, bias=None, eps=1e-5 +): + """Implements layer normalization in Jax as defined by `aten::native_layer_norm`. + + Args: + input: The input tensor. + normalized_shape: A list of integer dimensions to be normalized over. + weight: Optional weight tensor for the affine transformation. + bias: Optional bias tensor for the affine transformation. + eps: A small epsilon value for numerical stability. + + Returns: + output: The normalized tensor. + mean: The calculated mean tensor. + std: The calculated standard deviation tensor. 
+ """ + if isinstance(normalized_shape, int): + normalized_shape = [normalized_shape] + axis = [i for i, d in enumerate(input.shape) if d in normalized_shape] + + # Calculate mean and standard deviation + mean = jnp.mean(input, axis=axis, keepdims=True) + var = jnp.var(input, axis=axis, keepdims=True) + rstd = jax.lax.rsqrt(var + eps) + + # Normalize the input + norm_x = (input - mean) * rstd + + # Apply affine transformation (if provided) + if weight is not None: + norm_x *= weight + if bias is not None: + norm_x += bias + return norm_x, mean, rstd + + +# - func: addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor +@op(torch.ops.aten.addmm) +@op(torch.ops.aten.addmv) +def _aten_addmm(self, mat1, mat2, *, beta=1.0, alpha=1.0): + alpha = jnp.array(alpha).astype(mat1.dtype) + beta = jnp.array(beta).astype(mat1.dtype) + self *= beta + self += alpha * jnp.matmul(mat1, mat2) + return self + + +@op(torch.ops.aten.addbmm.default) +def _aten_addbmm(input, batch1, batch2, *, beta=1, alpha=1): + alpha = jnp.array(alpha).astype(batch1.dtype) + beta = jnp.array(beta).astype(batch1.dtype) + mm = jnp.einsum("bxy, byz -> xz", batch1, batch2) + return jax.lax.cond( + beta == 0, lambda: alpha * mm, lambda: beta * input + alpha * mm + ) + + +@op(torch.ops.aten.gelu) +def _aten_gelu(self, *, approximate="none"): + approx = approximate == "tanh" + return jax.nn.gelu(self, approx) + + +@op(torch.ops.aten.squeeze) +@op(torch.ops.aten.squeeze_copy) +def _aten_squeeze_dim(self, dim): + """Squeezes a Jax tensor by removing a single dimension of size 1. + + Args: + self: The input tensor. + dim: The dimension to squeeze. + + Returns: + The squeezed tensor with the specified dimension removed if it is 1, + otherwise the original tensor is returned. + """ + + # Validate input arguments + if not isinstance(self, jnp.ndarray): + raise TypeError(f"Expected a Jax tensor, got {type(self)}.") + if isinstance(dim, int): + dim = [dim] + + # Check if the specified dimension has size 1 + if all([self.shape[d] != 1 for d in dim]): + return self + + # Use slicing to remove the dimension if it is 1 + new_shape = list(self.shape) + + def fix_dim(p): + if p < 0: + return p + len(self.shape) + return p + + dim = [fix_dim(d) for d in dim] + new_shape = [p for i, p in enumerate(self.shape) if i not in dim or p != 1] + return self.reshape(new_shape) + + +@op(torch.ops.aten.convolution) +def _aten_convolution( + input, + weight, + bias, + stride, + padding, + dilation, + transposed, + output_padding, + groups, +): + if transposed: + raise NotImplementedError("Transposed convolution is not implemented.") + + def make_padding(padding): + return ((p, p) for p in padding) + + def create_default_conv_dimension_numbers(num_spatial_dims): + # Ref: https://github.com/openxla/xla/blob/main/xla/client/xla_builder.cc#L4211 + # (batch dimension, feature dimension, spatial dimensions...) + lhs_spec = [0, 1] + # (out feature dimension, in feature dimension, spatial dimensions...) + rhs_spec = [0, 1] + # (batch dimension, feature dimension, spatial dimensions...) 
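+    # e.g. with two spatial dims each spec ends up as (0, 1, 2, 3), which
+    # corresponds to an NCHW input, an OIHW kernel and an NCHW output.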
+ out_spec = [0, 1] + for i in range(0, num_spatial_dims): + lhs_spec.append(i + 2) + rhs_spec.append(i + 2) + out_spec.append(i + 2) + return jax.lax.ConvDimensionNumbers( + *map(tuple, (lhs_spec, rhs_spec, out_spec)) + ) + + res = jax.lax.conv_general_dilated( + input, + weight, + stride, + make_padding(padding), + lhs_dilation=(1,) * len(stride), + rhs_dilation=dilation, + dimension_numbers=create_default_conv_dimension_numbers(len(stride)), + feature_group_count=groups, + batch_group_count=1, + ) + + if bias is not None: + # TODO(qihqi): bias always on channel? + if len(bias.shape) == 1: + shape = [1] * len(res.shape) + shape[1] = bias.shape[0] + bias = bias.reshape(tuple(shape)) + res = res + bias + return res + + +# _native_batch_norm_legit(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, bool training, float momentum, float eps) +@op(torch.ops.aten._native_batch_norm_legit) +def _aten__native_batch_norm_legit( + input, weight, bias, running_mean, running_var, training, momentum, eps +): + return _aten__native_batch_norm_legit_no_training( + input, weight, bias, running_mean, running_var, momentum, eps + ) + + +@op(torch.ops.aten._native_batch_norm_legit_no_training) +def _aten__native_batch_norm_legit_no_training( + input, weight, bias, running_mean, running_var, momentum, eps +): + if weight is None: + weight = jnp.ones_like(running_mean) + if bias is None: + bias = jnp.zeros_like(running_mean) + + def broadcast(t): + return jax.lax.broadcast_in_dim(t, input.shape, broadcast_dimensions=(1,)) + + if running_mean is not None: + a = input - broadcast(running_mean) + else: + a = input + if running_var is not None: + b = broadcast(jnp.sqrt(running_var + eps)) + else: + b = broadcast(jnp.sqrt(eps)) + return ( + a / b * broadcast(weight) + broadcast(bias), + jnp.array([]), + jnp.array([]), + ) + + +@op(torch.ops.aten.relu) +def _aten_relu(self): + return jax.nn.relu(self) + + +@op(torch.ops.aten.cat) +def _aten_cat(tensors, dims=0): + return jnp.concatenate(tensors, dims) + + +@op(torch.ops.aten.max_pool2d_with_indices) +@op(torch.ops.aten.max_pool3d_with_indices) +def _aten_max_pool2d_with_indices( + inputs, kernel_size, strides, padding=0, dilation=1, ceil_mode=False +): + num_batch_dims = len(inputs.shape) - len(kernel_size) - 1 + kernel_size = tuple(kernel_size) + strides = tuple(strides) + if isinstance(padding, int): + padding = tuple((padding, padding) for _ in range(len(kernel_size))) + elif isinstance(padding, list): + padding = tuple((p, p) for p in padding) + + window_shape = kernel_size + num_batch_dims = inputs.ndim - (len(window_shape) + 1) + strides = strides or (1,) * len(window_shape) + assert len(window_shape) == len( + strides + ), f"len({window_shape}) must equal len({strides})" + strides = (1,) * (1 + num_batch_dims) + strides + dims = (1,) * (1 + num_batch_dims) + window_shape + + is_single_input = False + if num_batch_dims == 0: + # add singleton batch dimension because lax.reduce_window always + # needs a batch dimension. 
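+    # Both the pooled values and the flat indices computed below are squeezed
+    # back afterwards, so the unbatched (values, indices) pair matches what
+    # torch returns.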
+ inputs = inputs[None] + strides = (1,) + strides + dims = (1,) + dims + is_single_input = True + + assert inputs.ndim == len(dims), f"len({inputs.shape}) != len({dims})" + if not isinstance(padding, str): + padding = tuple(map(tuple, padding)) + assert len(padding) == len(window_shape), ( + f"padding {padding} must specify pads for same number of dims as " + f"window_shape {window_shape}" + ) + assert all( + [len(x) == 2 for x in padding] + ), f"each entry in padding {padding} must be length 2" + padding = ((0, 0), (0, 0)) + padding + + indices = jnp.arange(np.prod(inputs.shape)).reshape(inputs.shape) + + def reduce_fn(a, b): + ai, av = a + bi, bv = b + which = av > bv + return jnp.where(which, ai, bi), jnp.where(which, av, bv) + + init_val = -jnp.inf + if inputs.dtype in (jnp.int32, jnp.int64): + init_val = -(1 << 31) + init_val = jnp.array(init_val).astype(inputs.dtype) + + indices, y = jax.lax.reduce_window( + (indices, inputs), (0, init_val), reduce_fn, dims, strides, padding + ) + if is_single_input: + indices = jnp.squeeze(indices, axis=0) + y = jnp.squeeze(y, axis=0) + return y, indices + + batch_result = pool( + inputs, -jnp.inf, jax.lax.max, kernel_size, strides, padding + ) + indices = pool(inputs, 0, jnp.argmax, kernel_size, strides, padding) + return batch_result, indices + + +# TODO add more ops + + +@op(torch.ops.aten.min) +def _aten_min(x, axis=None): + return jnp.min(x, axis=axis), jnp.argmin(x, axis=axis).astype(jnp.int64) + + +@op(torch.ops.aten.amin) +def _aten_amin(x, dim=None, keepdim=False): + return _with_reduction_scalar(jnp.amin, x, dim, keepdim) + + +@op(torch.ops.aten.argmin) +def _aten_argmin(self, dim=None, keepdim=False): + return _with_reduction_scalar(jnp.argmin, self, dim, keepdim) + + +@op(torch.ops.aten.sin) +def _aten_sin(x): + return jnp.sin(x) + + +@op(torch.ops.aten.sym_size) +def _aten_sym_size(x, dim): + return x.shape[dim] + + +@op(torch.ops.aten.var.correction) +@op(torch.ops.prims.var) +def _aten_var(x, dim=None, *, correction=1, keepdim=False, out=None): + return jnp.var(x, axis=dim, ddof=correction, keepdims=keepdim) + + +@op(torch.ops.prims.broadcast_in_dim) +def _prims_broadcast_in_dim(t, shape, broadcast_dimensions): + return jax.lax.broadcast_in_dim( + t, shape, broadcast_dimensions=broadcast_dimensions + ) + + +# aten.native_group_norm -- should use decomp table +# func: native_group_norm(Tensor input, Tensor? weight, Tensor? bias, SymInt N, SymInt C, SymInt HxW, int group, float eps) -> (Tensor, Tensor, Tensor) + + +@op(torch.ops.aten.native_group_norm) +def _aten_native_group_norm(input, weight, bias, N, C, HxW, group, eps=1e-5): + """Group Normalization implementation in JAX. + + Args: + input: Input tensor. Expected shape (batch_size, channels, ... spatial dims + ...) + weight: Optional scaling (gamma) parameter. Shape (channels,) + bias: Optional shifting (beta) parameter. Shape (channels,) + N: Batch size. + C: Number of channels. + HxW: Product of spatial dimensions (number of elements per channel after + flattening). + group: Number of groups for Group Normalization. + eps: Small value added for numerical stability. 
+ + Returns: + A tuple of (normalized_output, mean, rstd) + """ + + input_shape = input.shape + + # Reshape for group-wise normalization + reshaped_input = jnp.reshape(input, (1, N * group, -1)) + + # **Core Group Normalization** + def group_norm_body(x): # Function to apply within each group + mean = jnp.mean(x, axis=-1, keepdims=True) + var = jnp.var(x, axis=-1, keepdims=True) + rstd = jax.lax.rsqrt(var + eps) # Reciprocal of std with epsilon + normalized = (x - mean) * rstd + return normalized, mean, rstd + + normalized, group_mean, group_rstd = jax.lax.map( + group_norm_body, reshaped_input + ) + + # Reshape back to original input shape + output = jnp.reshape(normalized, input_shape) + + # **Affine transformation** + affine_shape = [ + -1 if i == 1 else 1 for i in range(input.ndim) + ] # Shape for broadcasting + if weight is not None and bias is not None: + output = bias.reshape(affine_shape) + output * weight.reshape(affine_shape) + elif weight is not None: + output = output * weight.reshape(affine_shape) + elif bias is not None: + output = output + bias.reshape(affine_shape) + + # Reshape mean and rstd + mean = jnp.reshape(group_mean, (N, group)) + rstd = jnp.reshape(group_rstd, (N, group)) + + return output, mean, rstd + + +@op(torch.ops.aten.linalg_vector_norm) +def _aten_linalg_vector_norm(self, ord=2, dim=None, keepdim=False, dtype=None): + """Calculates the vector norm along specified dimensions. + + Args: + self: The input tensor. + ord: The order of the norm. Can be a float or 'inf', '-inf', 'fro'. + Default is 2 (Euclidean norm). + dim: Dimensions along which to calculate the norm. If None, the norm is + calculated over all dimensions. + keepdim: Whether to keep the reduced dimensions. + dtype: Optional data type for the output. + + Returns: + The tensor containing the calculated vector norms. + """ + + if ord not in {2, float("inf"), float("-inf"), "fro"}: + raise ValueError( + f"Unsupported ord value: {ord}. Supported values are 2, inf, -inf, and" + " 'fro'." + ) + + # Special cases (for efficiency and clarity) + if ord == 2: # Euclidean norm + result = jnp.sqrt(jnp.sum(jnp.abs(self) ** 2, axis=dim, keepdims=keepdim)) + + elif ord == float("inf"): + result = jnp.max(jnp.abs(self), axis=dim, keepdims=keepdim) + + elif ord == float("-inf"): + result = jnp.min(jnp.abs(self), axis=dim, keepdims=keepdim) + + elif ord == "fro": # Frobenius norm + result = jnp.sqrt(jnp.sum(jnp.abs(self) ** 2, axis=dim, keepdims=keepdim)) + + else: # General case (e.g., ord = 1, ord = 3) + result = jnp.sum(jnp.abs(self) ** ord, axis=dim, keepdims=keepdim) ** ( + 1.0 / ord + ) + + # (Optional) dtype conversion + if dtype is not None: + result = result.astype(dtype) + + return result + + +# aten.reflection_pad1d +@op(torch.ops.aten.reflection_pad1d) +def _aten_reflection_pad1d(input, padding): + rank = len(input.shape) + pad_size = [(0, 0)] * rank + pad_size[-1] = padding + return jnp.pad(input, pad_size, mode="reflect") + + +# aten.alias +@op(torch.ops.aten.alias) +def _aten_alias(self, *args): + return self + + +# aten.sinh +@op(torch.ops.aten.sinh) +def _aten_sinh(self): + return jnp.sinh(self) + + +# aten.native_layer_norm_backward +@op(torch.ops.aten.native_layer_norm_backward) +def _aten_native_layer_norm_backward( + grad_out, input, normalized_shape, weight, bias, eps=1e-5 +): + """Implements the backward pass of layer normalization in Jax as defined by `aten::native_layer_norm_backward`. + + Args: + grad_out: The gradient of the output tensor. + input: The input tensor. 
+ normalized_shape: A list of integer dimensions to be normalized over. + weight: Optional weight tensor for the affine transformation. + bias: Optional bias tensor for the affine transformation. + eps: A small epsilon value for numerical stability. + + Returns: + A tuple of (grad_input, grad_weight, grad_bias). + """ + return jax.lax.native_layer_norm_backward( + grad_out, input, normalized_shape, weight, bias, eps + ) + + +# aten.reflection_pad3d_backward +# aten.reflection_pad2d + + +# aten.atanh +@op(torch.ops.aten.atanh) +def _aten_atanh(self): + return jnp.arctanh(self) + + +# aten.bitwise_not +@op(torch.ops.aten.bitwise_not) +def _aten_bitwise_not(self): + return ~self + + +# aten.embedding_dense_backward + + +# aten.sum +@op(torch.ops.aten.sum) +def _aten_sum(self, dim=None, keepdim=False, dtype=None): + if not dim: + dim = None + return jnp.sum(self, axis=dim, keepdims=keepdim, dtype=dtype) + + +# aten.sqrt +@op(torch.ops.aten.sqrt) +def _aten_sqrt(self): + return jnp.sqrt(self) + + +@op(torch.ops.aten.tan) +def _aten_tanh(self): + return jnp.tan(self) + + +# aten.tanh +@op(torch.ops.aten.tanh) +def _aten_tanh(self): + return jnp.tanh(self) + + +# aten.ceil +@op(torch.ops.aten.ceil) +def _aten_ceil(self): + return jnp.ceil(self) + + +# aten.asin +@op(torch.ops.aten.asin) +def _aten_asin(self): + return jnp.arcsin(self) + + +# aten.minimum +@op(torch.ops.aten.minimum) +def _aten_minimum(self, other): + return jnp.minimum(self, other) + + +# aten.max_pool2d_backward + + +def _scatter_index(dim, index): + """Returns a tuple of indexes; + + The first is to select in input (to modify), + the second is to select from the values. + """ + index_shape = list(index.shape) + input_indexes = [] + source_indexes = [] + for i in range(len(index_shape)): + source_indexes.append(slice(0, index_shape[i])) + if i == dim: + input_indexes.append(index) + else: + target_shape = [1] * len(index_shape) + target_shape[i] = index_shape[i] + input_indexes.append( + jnp.broadcast_to( + jnp.arange(index_shape[i]).reshape(target_shape), index_shape + ) + ) + return tuple(input_indexes), tuple(source_indexes) + + +# aten.scatter_add +@op(torch.ops.aten.scatter_add) +def _aten_scatter_add(input, dim, index, src): + """JAX implementation of scatter, mimicking torch.scatter behavior""" + + input_indexes, source_indexes = _scatter_index(dim, index) + return input.at[input_indexes].add(src[source_indexes]) + + +# aten.logical_not + + +# aten.sign +@op(torch.ops.aten.sign) +def _aten_sign(x): + return jnp.sign(x) + + +# aten.sigmoid +@op(torch.ops.aten.sigmoid) +def _aten_sigmoid(x): + if x.dtype in (jnp.int32, jnp.int64): + x = x.astype(jnp.float32) + return jax.nn.sigmoid(x) + + +# implement aten.asinh in jax +@op(torch.ops.aten.asinh) +def _aten_asinh(self): + return jnp.arcsinh(self) + + +# aten.atan +@op(torch.ops.aten.atan) +def _aten_atan(self): + return jnp.arctan(self) + + +# aten.scatter_reduce +@op(torch.ops.aten.scatter_reduce) +def _aten_scatter_reduce(input, dim, index, src, reduce, *, include_self=True): + input_indexes, source_indexes = _scatter_index(dim, index) + if reduce == "sum": + return input.at[input_indexes].add(src[source_indexes]) + elif reduce == "prod": + return input.at[input_indexes].multiply(src[source_indexes]) + elif reduce == "mean": + return input.at[input_indexes].add(src[source_indexes]) + elif reduce == "amax": + return input.at[input_indexes].max(src[source_indexes]) + elif reduce == "amin": + return input.at[input_indexes].min(src[source_indexes]) + else: + raise 
RuntimeError("Unknow reduction type: ", reduce) + + +# aten.acos +@op(torch.ops.aten.acos) +def _aten_acos(self): + return jnp.arccos(self) + + +# aten.sym_storage_offset +# aten.native_layer_norm_backward +# aten.max_pool3d_with_indices + + +# aten.gt +@op(torch.ops.aten.gt) +def _aten_gt(self, other): + return self > other + + +# aten.pixel_shuffle +@op(torch.ops.aten.pixel_shuffle) +def _aten_pixel_shuffle(x, upscale_factor): + """PixelShuffle implementation in JAX. + + Args: + x: Input tensor. Typically a feature map. + upscale_factor: Integer by which to upscale the spatial dimensions. + + Returns: + Tensor after PixelShuffle operation. + """ + + batch_size, channels, height, width = x.shape + + if channels % (upscale_factor**2) != 0: + raise ValueError( + "Number of channels must be divisible by the square of the upscale factor." + ) + + new_channels = channels // (upscale_factor**2) + new_height = height * upscale_factor + new_width = width * upscale_factor + + x = x.reshape( + batch_size, new_channels, upscale_factor, upscale_factor, height, width + ) + x = jnp.transpose( + x, (0, 1, 2, 4, 3, 5) + ) # Move channels to spatial dimensions + x = x.reshape(batch_size, new_channels, new_height, new_width) + + return x + + +# aten.sym_stride +# aten.lt +@op(torch.ops.aten.lt) +def _aten_lt(self, other): + return self < other + + +def pool(inputs, init, reduce_fn, window_shape, strides, padding): + """Helper function to define pooling functions. + + Pooling functions are implemented using the ReduceWindow XLA op. + NOTE: Be aware that pooling is not generally differentiable. + That means providing a reduce_fn that is differentiable does not imply that + pool is differentiable. + + Args: + inputs: input data with dimensions (batch, window dims..., features). + init: the initial value for the reduction + reduce_fn: a reduce function of the form ``(T, T) -> T``. + window_shape: a shape tuple defining the window to reduce over. + strides: a sequence of ``n`` integers, representing the inter-window + strides (default: ``(1, ..., 1)``). + padding: either the string ``'SAME'``, the string ``'VALID'``, or a sequence + of ``n`` ``(low, high)`` integer pairs that give the padding to apply before + and after each spatial dimension. + Returns: + The output of the reduction for each window slice. + """ + num_batch_dims = inputs.ndim - (len(window_shape) + 1) + strides = strides or (1,) * len(window_shape) + assert len(window_shape) == len( + strides + ), f"len({window_shape}) must equal len({strides})" + strides = (1,) * (1 + num_batch_dims) + strides + dims = (1,) * (1 + num_batch_dims) + window_shape + + is_single_input = False + if num_batch_dims == 0: + # add singleton batch dimension because lax.reduce_window always + # needs a batch dimension. 
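+    # e.g. an unbatched (H, W, C) input becomes (1, H, W, C) here and the
+    # result is squeezed back to (H, W, C) after the reduce_window call.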
+ inputs = inputs[None] + strides = (1,) + strides + dims = (1,) + dims + is_single_input = True + + assert inputs.ndim == len(dims), f"len({inputs.shape}) != len({dims})" + if not isinstance(padding, str): + padding = tuple(map(tuple, padding)) + assert len(padding) == len(window_shape), ( + f"padding {padding} must specify pads for same number of dims as " + f"window_shape {window_shape}" + ) + assert all( + [len(x) == 2 for x in padding] + ), f"each entry in padding {padding} must be length 2" + padding = ((0, 0), (0, 0)) + padding + y = jax.lax.reduce_window(inputs, init, reduce_fn, dims, strides, padding) + if is_single_input: + y = jnp.squeeze(y, axis=0) + return y + + +@op(torch.ops.aten._adaptive_avg_pool3d) +def _aten_adaptive_avg_pool3d(x, output_shape): + return _aten_adaptive_avg_pool(x, output_shape, 3) + + +@op(torch.ops.aten._adaptive_avg_pool2d) +def _aten_adaptive_avg_pool3d(x, output_shape): + return _aten_adaptive_avg_pool(x, output_shape, 2) + + +def _aten_adaptive_avg_pool(x, output_shape, pool_dim): + def adaptive_kernel_size(input_shape, output_shape): + sizes = [1, 1] + spatial_dim_off = len(input_shape) - pool_dim + for spatial_dim in range(pool_dim): + sizes.append( + input_shape[spatial_dim_off + spatial_dim] // output_shape[spatial_dim] + ) + return tuple(sizes) + + kernel_sizes = adaptive_kernel_size(x.shape, output_shape) + y = pool(x, 0.0, jax.lax.add, kernel_sizes, kernel_sizes, padding="VALID") + + div_shape = list(x.shape) + num_batch_dims = len(x.shape) - pool_dim - 1 + div_shape[num_batch_dims] = 1 + div_shape = tuple(div_shape) + if len(div_shape) - 2 == len(kernel_sizes): + div_shape = (1,) + div_shape[1:] + y = y / pool( + jnp.ones(div_shape), 0.0, jax.lax.add, kernel_sizes, kernel_sizes, "VALID" + ) + return y + + +# aten.avg_pool2d +@op(torch.ops.aten.avg_pool2d) +@op(torch.ops.aten.avg_pool3d) +def _aten_avg_pool( + inputs, + kernel_size, + strides=None, + padding=0, + ceil_mode=False, + count_include_pad=True, + divisor_override=None, +): + num_batch_dims = len(inputs.shape) - len(kernel_size) - 1 + kernel_size = tuple(kernel_size) + strides = tuple(strides) + if isinstance(padding, int): + padding = tuple((padding, padding) for _ in range(len(kernel_size))) + elif isinstance(padding, list): + padding = tuple((p, p) for p in padding) + + y = pool(inputs, 0.0, jax.lax.add, kernel_size, strides, padding) + if count_include_pad: + y = y / np.prod(kernel_size) + else: + div_shape = list(inputs.shape) + div_shape[num_batch_dims] = 1 + div_shape = tuple(div_shape) + if len(div_shape) - 2 == len(kernel_size): + div_shape = (1,) + div_shape[1:] + y = y / pool( + jnp.ones(div_shape), 0.0, jax.lax.add, kernel_size, strides, padding + ) + return y + + +# aten.sym_numel +# aten.reciprocal +@op(torch.ops.aten.reciprocal) +def _aten_reciprocal(a): + return 1 / a + + +# aten.scatter +@op(torch.ops.aten.select_scatter) +def _aten_select_scatter(input, src, dim, index): + input_indexes = [] + for x in range(len(input.shape)): + if x == dim: + input_indexes.append(index) + else: + input_indexes.append(slice(None, None, None)) + return input.at[tuple(input_indexes)].set(src) + + +@op(torch.ops.aten.scatter.src) +def _aten_scatter_src(input, dim, index, src, reduce=None): + input_index, source_indexes = _scatter_index(dim, index) + return input.at[input_index].set(src[source_indexes]) + + +@op(torch.ops.aten.scatter.value) +def _aten_scatter(input, dim, index, src, reduce=None): + input_index, source_indexes = _scatter_index(dim, index) + return 
input.at[input_index].set(src) + + +# aten.acosh +@op(torch.ops.aten.acosh) +def _aten_acosh(self): + return jnp.arccosh(self) + + +# aten.avg_pool2d_backward +# aten.col2im +# aten.avg_pool3d +# aten.round +@op(torch.ops.aten.round) +def _aten_round(input, decimals=0): + return jnp.round(input, decimals) + + +# aten.max +@op(torch.ops.aten.max) +def _aten_max(self, dim=None, keepdim=False): + return jnp.max(self, axis=dim, keepdims=keepdim), jnp.argmax( + self, axis=dim, keepdims=keepdim + ) + + +# aten.maximum +@op(torch.ops.aten.maximum) +def _aten_maximum(self, other): + return jnp.maximum(self, other) + + +# aten.abs +@op(torch.ops.aten.abs) +def _aten_abs(self): + return jnp.abs(self) + + +# generate aten.amax only +@op(torch.ops.aten.amax) +def _aten_amax(self, dim=None, keepdim=False): + return _with_reduction_scalar(jnp.amax, self, dim, keepdim) + + +def _with_reduction_scalar(jax_func, self, dim, keepdim): + expanded = False + if self.ndim == 0: + # for self of rank 0: + # torch.any(x, 0), torch.any(x, -1) works; + # torch.any(x, 1) throws out of bounds, so it's + # behavior is the same as a jnp array of rank 1 + expanded = True + self = jnp.expand_dims(self, 0) + res = jax_func(self, axis=dim, keepdims=keepdim) + if expanded: + res = res.squeeze() + return res + + +# aten.any +@op(torch.ops.aten.any) +def _aten_any(self, dim=None, keepdim=False): + return _with_reduction_scalar(jnp.any, self, dim, keepdim) + + +# aten.arange +@op(torch.ops.aten.arange.start_step) +@op(torch.ops.aten.arange.start) +@op(torch.ops.aten.arange.default) +def _aten_arange( + start, + end=None, + step=1, + *, + dtype=None, + layout=None, + requires_grad=False, + device=None, + pin_memory=False, +): + if end is None: + end = start + start = 0 + if dtype: + dtype = tensor.t2j_dtype(dtype) + return jnp.arange( + start, + end, + step, + dtype=dtype, + ) + + +# aten.argmax +@op(torch.ops.aten.argmax) +def _aten_argmax(self, dim=None, keepdim=False): + return _with_reduction_scalar(jnp.argmax, self, dim, keepdim) + + +# aten.as_strided +@op(torch.ops.aten.as_strided) +@op(torch.ops.aten.as_strided_copy) +def _aten_as_strided(x, sizes, strides, storage_offset=None): + ind = jnp.zeros(sizes, dtype=jnp.int32) + + for i, (size, stride) in enumerate(zip(sizes, strides)): + result_shape = (1,) * i + (size,) + (1,) * (len(sizes) - i - 1) + indexes = (jnp.arange(size) * stride).reshape(result_shape) + ind += indexes + + return jnp.ravel(x)[ind] + + +# aten.atan2 +@op(torch.ops.aten.atan2) +def _aten_atan2(self, other): + return jnp.arctan2(self, other) + + +# aten.bitwise_and +@op(torch.ops.aten.bitwise_and) +def _aten_bitwise_and(self, other): + return self & other + + +# aten.bitwise_or +@op(torch.ops.aten.bitwise_or) +def _aten_bitwise_or(self, other): + return self | other + + +# aten.bitwise_xor +@op(torch.ops.aten.bitwise_xor) +def _aten_bitwise_xor(self, other): + return self ^ other + + +# aten.clamp +@op(torch.ops.aten.clamp.default) +@op(torch.ops.aten.clamp.Tensor) +def _aten_clamp(self, min=None, max=None): + return jnp.clip(self, min, max) + + +# aten.constant_pad_nd +@op(torch.ops.aten.constant_pad_nd) +def _aten_constant_pad_nd(input, padding, value=0): + # NOTE: Torch padding is flat and reversed: (1, 1, 2, 2) + # means last dim get padded 1 in front and 1 in back; + # and second last dim get padded 2 in front and 2 in back. 
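+  # e.g. for a rank-3 input, (1, 1, 2, 2) pads the last dim by 1 on each side
+  # and the second-to-last dim by 2 on each side; any leading dims are left
+  # untouched.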
+ # Jax padding tuple of 2-tuple: the same padding is + # [(0, 0), ..., (2,2), (1,1)] + m = len(padding) + rev_padding = [(padding[i - 1], padding[i]) for i in range(m - 1, 0, -2)] + pad_dim = tuple(([(0, 0)] * (len(input.shape) - m // 2)) + rev_padding) + return jnp.pad(input, pad_dim, mode="constant", constant_values=value) + + +# aten.convolution_backward +@op(torch.ops.aten.copy) +@op(torch.ops.aten.lift_fresh_copy) +def _aten_copy(x): + return jnp.copy(x) + + +@op(torch.ops.aten._cdist_forward) +def _aten_cdist_forward(x1, x2, p, compute_mode=""): + # x1 is B x P x M + # x2 is B x Q x M + # res is B x P x Q + x1 = jnp.expand_dims(x1, len(x1.shape) - 1) + x2 = jnp.expand_dims(x2, len(x2.shape) - 2) + return jnp.linalg.norm(x1 - x2, ord=p, axis=-1) + + +@op(torch.ops.aten._pdist_forward) +def _aten__pdist_forward(x, p): + pairwise_dists = _aten_cdist_forward(x, x, p) + condensed_dists = pairwise_dists[ + jnp.triu_indices(pairwise_dists.shape[0], k=1) + ] + return condensed_dists + + +# aten.cos +@op(torch.ops.aten.cos) +def _aten_cos(input): + return jnp.cos(input) + + +# aten.cosh +@op(torch.ops.aten.cosh) +def _aten_cosh(input): + return jnp.cosh(input) + + +# aten.diagonal +@op(torch.ops.aten.diagonal) +def _aten_diagonal(input, offset=0, dim1=0, dim2=1): + return jnp.diagonal(input, offset, dim1, dim2) + + +# aten.empty_strided +# aten.eq +@op(torch.ops.aten.eq) +def _aten_eq(input1, input2): + return input1 == input2 + + +# aten.erf +@op(torch.ops.aten.erf) +def _aten_erf(x): + if x.dtype in (jnp.int32, jnp.int64): + x = x.astype(jnp.float32) + return jax.lax.erf(x) + + +# aten.exp +@op(torch.ops.aten.exp) +def _aten_exp(input): + return jnp.exp(input) + + +# aten.expm1 +@op(torch.ops.aten.expm1) +def _aten_expm1(input): + return jnp.expm1(input) + + +# aten.fill +@op(torch.ops.aten.fill) +@op(torch.ops.aten.full_like) +def _aten_fill(x, value, dtype=None, pin_memory=None, memory_format=None): + if dtype is None: + dtype = x.dtype + else: + dtype = tensor.t2j_dtype(dtype) + return jnp.full(x.shape, value, dtype) + + +# aten.flip +@op(torch.ops.aten.flip) +def _aten_flip(input, dims): + if dims is not None: + return jnp.flip(input, tuple(dims)) + else: + return jnp.flip(input) + + +# aten.floor +@op(torch.ops.aten.floor) +def _aten_floor(input): + return jnp.floor(input) + + +# aten.fmod +@op(torch.ops.aten.fmod) +def _aten_fmod(input, other): + return input - other * _aten_div(input, other, "trunc") + + +# aten.gather +@op(torch.ops.aten.gather) +def _aten_gather(input, dim, index): + input_indexes, source_indexes = _scatter_index(dim, index) + return input[input_indexes] + + +# aten.ge +@op(torch.ops.aten.ge) +def _aten_ge(self, other): + return self >= other + + +@op(torch.ops.aten.glu) +@op(torch.ops.aten.glu.default) +def _aten_glu(x, dim=-1): + return jax.nn.glu(x, dim) + + +# aten.hardtanh +@op(torch.ops.aten.hardtanh) +def _aten_hardtanh(input, min_val=-1.0, max_val=1.0, inplace=False): + return jnp.clip(input, min_val, max_val) + + +# aten.isinf +@op(torch.ops.aten.isinf) +def _aten_isinf(input): + return jnp.isinf(input) + + +# aten.isnan +@op(torch.ops.aten.isnan) +def _aten_isnan(input): + return jnp.isnan(input) + + +@op(torch.ops.aten.le) +def _aten_le(self, other): + return self <= other + + +# aten.leaky_relu +@op(torch.ops.aten.leaky_relu) +def _aten_leaky_relu(x, negative_slope): + return jax.nn.leaky_relu(x, negative_slope) + + +# aten.log +@op(torch.ops.aten.log) +def _aten_log(x): + return jnp.log(x) + + +# aten.log10 +@op(torch.ops.aten.log10) +def 
_aten_log10(x): + return jnp.log10(x) + + +# aten.log1p +@op(torch.ops.aten.log1p) +def _aten_log1p(x): + return jnp.log1p(x) + + +# aten.log2 +@op(torch.ops.aten.log2) +def _aten_log2(x): + return jnp.log2(x) + + +# aten.logical_and +@op(torch.ops.aten.logical_and) +def _aten_logical_and(self, other): + return jnp.logical_and(self, other) + + +# aten.logical_or +@op(torch.ops.aten.logical_or) +def _aten_logical_or(self, other): + return jnp.logical_or(self, other) + + +# aten.logical_not +@op(torch.ops.aten.logical_not) +def _aten_logical_not(self): + return jnp.logical_not(self) + + +# aten.log_softmax +@op(torch.ops.aten._log_softmax) +def _aten_log_softmax(self, axis=-1, half_to_float=False): + return jax.nn.log_softmax(self, axis) + + +# aten.max_pool3d_backward +# aten.logical_xor +@op(torch.ops.aten.logical_xor) +def _aten_logical_xor(self, other): + return jnp.logical_xor(self, other) + + +# aten.max_pool2d_with_indices_backward +# aten.native_dropout +# aten.native_group_norm_backward +# aten.neg +@op(torch.ops.aten.neg) +def _aten_neg(x): + return -1 * x + + +# aten.nonzero +@op(torch.ops.aten.nonzero) +def _aten_nonzero(x): + index_tuple = jnp.nonzero(x) + index_tuple = [jnp.expand_dims(p, -1) for p in index_tuple] + return jnp.concatenate(index_tuple, axis=-1) + + +# aten.prod + + +@op(torch.ops.aten.prod) +def _aten_prod(self, dim=None, keepdim=False): + return jnp.prod(self, axis=dim, keepdims=keepdim) + + +# aten.randperm + + +# aten.reflection_pad3d + + +# aten.remainder +@op(torch.ops.aten.remainder) +def _aten_remainder(inputs, other): + return inputs % other + + +# aten.repeat +@op(torch.ops.aten.repeat) +def _aten_repeat(x, reps): + return jnp.tile(x, reps) + + +# aten.replication_pad2d +# aten.replication_pad3d +# aten.roll +@op(torch.ops.aten.roll) +def _aten_roll(input, shifts, dims=None): + return jnp.roll(input, shifts, dims) + + +# aten.scalar_tensor +# aten.slice_scatter +@op(torch.ops.aten.slice_scatter) +def _aten_slice_scatter(input, src, dim=0, start=None, end=None, step=1): + input_index = [] + for x in range(len(input.shape)): + if x == dim: + input_index.append(slice(start, end, step)) + else: + input_index.append(slice(None, None, None)) + return input.at[tuple(input_index)].set(src) + + +# aten.sort +# torch.sort(input, dim=-1, descending=False, stable=False, *, out=None) +@op(torch.ops.aten.sort) +def _aten_sort(a, dim=-1, descending=False, stable=False): + return ( + jnp.sort(a, axis=dim, stable=stable, descending=descending), + jnp.argsort(a, axis=dim, stable=stable, descending=descending), + ) + + +# aten.sym_size + + +# aten.topk +@op(torch.ops.aten.topk) +def _aten_topk(input, k, dim=None, largest=True, sorted=True, *, out=None): + """JAX top-k implementation using jax.lax.top_k for improved efficiency. + + Args: + input: The input JAX array. + k: The number of top elements to return. + dim: The dimension along which to find the top-k. If None, operates on the + flattened array. + largest: If True, returns the largest k elements. Otherwise, smallest k. + sorted: If True, returns the elements in sorted order. + + Returns: + A tuple (values, indices) containing: + - values: The top k values. + - indices: The indices of the top k values in the original array. 
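+
+  Example (illustrative, 1-D input):
+    values, indices = _aten_topk(jnp.array([1., 3., 2.]), k=2)
+    # values  -> [3., 2.]
+    # indices -> [1, 2]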
+ """ + if dim is None: + input = input.flatten() + dim = 0 + + if not largest: + input = -input # Find top-k of negated input if we want the smallest + + transpose_shape = None + if dim != -1 and dim != len(input.shape) - 1: + transpose_shape = list(range(len(input.shape))) + transpose_shape[dim], transpose_shape[-1] = ( + transpose_shape[-1], + transpose_shape[dim], + ) + input = jnp.transpose(input, transpose_shape) + + values, indices = jax.lax.top_k(input, k) + + if sorted: + values = jnp.sort(values, descending=True) + indices = jnp.take_along_axis( + indices, jnp.argsort(values, axis=-1, descending=True), axis=-1 + ) + + if not largest: + values = -values # Negate values back if we found smallest + + if transpose_shape is not None: + values = jnp.transpose(values, transpose_shape) + indices = jnp.transpose(indices, transpose_shape) + + return values, indices + + +# aten.trunc +@op(torch.ops.aten.trunc) +def _aten_trunc(a): + return jnp.trunc(a) + + +@op(torch.ops.aten.unbind) +@op(torch.ops.aten.unbind_copy) +def _aten_unbind(a, dim=0): + return tuple( + _aten_squeeze_dim(jax.lax.index_in_dim(a, i, axis=dim), dim) + for i in range(a.shape[dim]) + ) + + +# NOTE: skip aten.upsample_nearest2d and aten.upsample_bilinear2d +# despite those being core aten ops, they also have decompositions. +# here we are using torch decompositions. + + +# aten.where +@op(torch.ops.aten.where.self) +@op(torch.ops.aten.where.ScalarSelf) +@op(torch.ops.aten.where.ScalarOther) +def _aten_where(condition, x, y): + return jnp.where(condition, x, y) + + +# aten.to.dtype +# Tensor(a) self, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None +@op(torch.ops.aten.to.dtype) +def _aten_to_dtype( + a, dtype, non_blocking=False, copy=False, memory_format=None +): + if dtype: + jaxdtype = tensor.t2j_dtype(dtype) + return a.astype(jaxdtype) + + +# aten.to.device + + +# Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False +@op(torch.ops.aten.var_mean.correction) +def _aten_var_mean_correction(self, dim=None, correction=None, keepdim=False): + return ( + jnp.var(self, axis=dim, ddof=correction, keepdims=keepdim), + jnp.mean(self, dim, keepdims=keepdim), + ) + + +@op(torch.ops.aten.scalar_tensor) +def _aten_scalar_tensor( + s, dtype=None, layout=None, device=None, pin_memory=None +): + if dtype is not None: + dtype = tensor.t2j_dtype(dtype) + return jnp.array(s, dtype=dtype) + return jnp.array(s) + + +@op(torch.ops.aten.to.device) +def _aten_to_device(x, device, dtype): + return x + + +@op(torch.ops.aten.max_pool2d_with_indices_backward) +def max_pool2d_with_indices_backward_custom( + grad_output, self, kernel_size, stride, padding, dilation, ceil_mode, indices +): + """ + Approximates the gradient calculation of PyTorch's max_pool2d_with_indices_backward. + + Args: + grad_output: The gradient tensor from the preceding layer. + self: The input tensor on which the original max pooling was performed. + kernel_size: The size of the pooling window. + stride: The stride of the pooling window. + padding: The padding applied during max pooling. + dilation: The dilation factor for the pooling operation. + ceil_mode: Whether to use ceil or floor when calculating output shapes. + indices: The indices of the maximum values, as produced by max_pool2d_with_indices. + + Returns: + The calculated gradient with respect to the input (grad_input). 
+ """ + + kH, kW = kernel_size + dH, dW = stride + padH, padW = padding + dilH, dilW = dilation + + # Calculate output shape (may need adjustment based on ceil_mode) + out_shape = jnp.array(self.shape) + grad_input = jnp.zeros_like(self) + + # Iterate over the flattened input and output tensors + for i, idx in enumerate(indices.flatten()): + # Calculate input coordinates corresponding to the maximum value + out_y, out_x = i // grad_output.shape[3], i % grad_output.shape[3] + in_y = out_y * dH - padH + out_y * (dilH - 1) + in_x = out_x * dW - padW + out_x * (dilW - 1) + + # Scatter the gradient to the appropriate input locations (handling potential overlaps) + for y in range(in_y, in_y + kH): + for x in range(in_x, in_x + kW): + if 0 <= y < grad_input.shape[2] and 0 <= x < grad_input.shape[3]: + grad_input = grad_input.at[y, x].add(grad_output.flatten()[i]) + + return grad_input + + +@op(torch.ops.aten._local_scalar_dense) +def _aten_local_scalar_dense(x): + return x.item() + + +@op(torch.ops.aten.tensor_split.sections) +def _aten_tensor_split(ary, indices_or_sections, axis=0): + return jnp.array_split(ary, indices_or_sections, axis) + + +@op(torch.ops.aten.randn, needs_env=True) +def _randn( + *size, + generator=None, + out=None, + dtype=None, + layout=torch.strided, + device=None, + requires_grad=False, + pin_memory=False, + env=None, +): + shape = size + if len(shape) == 1 and isinstance(shape[0], (list, tuple)): + shape = shape[0] + key = env.get_and_rotate_prng_key() + res = jax.random.normal(key, shape) + if dtype is not None: + dtype = tensor.t2j_dtype(dtype) + res = res.astype(dtype) + return res + + +@op(torch.ops.aten.rand, needs_env=True) +def _rand( + *size, + generator=None, + out=None, + dtype=None, + layout=torch.strided, + device=None, + requires_grad=False, + pin_memory=False, + env=None, +): + shape = size + if len(shape) == 1 and isinstance(shape[0], (list, tuple)): + shape = shape[0] + key = env.get_and_rotate_prng_key() + res = jax.random.uniform(key, shape) + if dtype is not None: + dtype = tensor.t2j_dtype(dtype) + res = res.astype(dtype) + return res + + +@op(torch.ops.aten.scalar_tensor.default) +def _aten_scalar_tensor(val, **kwargs): + p = torch.ops.aten.scalar_tensor(val) + return tensor.t2j(p) + + +@op(torch.ops.aten.to.device) +def _aten_to_device(x, device, dtype): + return x + + +@op(torch.ops.aten.max_pool2d_with_indices_backward) +def max_pool2d_with_indices_backward_custom( + grad_output, self, kernel_size, stride, padding, dilation, ceil_mode, indices +): + """ + Approximates the gradient calculation of PyTorch's max_pool2d_with_indices_backward. + + Args: + grad_output: The gradient tensor from the preceding layer. + self: The input tensor on which the original max pooling was performed. + kernel_size: The size of the pooling window. + stride: The stride of the pooling window. + padding: The padding applied during max pooling. + dilation: The dilation factor for the pooling operation. + ceil_mode: Whether to use ceil or floor when calculating output shapes. + indices: The indices of the maximum values, as produced by max_pool2d_with_indices. + + Returns: + The calculated gradient with respect to the input (grad_input). 
+ """ + + kH, kW = kernel_size + dH, dW = stride + padH, padW = padding + dilH, dilW = dilation + + # Calculate output shape (may need adjustment based on ceil_mode) + out_shape = jnp.array(self.shape) + grad_input = jnp.zeros_like(self) + + # Iterate over the flattened input and output tensors + for i, idx in enumerate(indices.flatten()): + # Calculate input coordinates corresponding to the maximum value + out_y, out_x = i // grad_output.shape[3], i % grad_output.shape[3] + in_y = out_y * dH - padH + out_y * (dilH - 1) + in_x = out_x * dW - padW + out_x * (dilW - 1) + + # Scatter the gradient to the appropriate input locations (handling potential overlaps) + for y in range(in_y, in_y + kH): + for x in range(in_x, in_x + kW): + if 0 <= y < grad_input.shape[2] and 0 <= x < grad_input.shape[3]: + grad_input = grad_input.at[y, x].add(grad_output.flatten()[i]) + + return grad_input + + +@op(torch.ops.aten._local_scalar_dense) +def _aten_local_scalar_dense(x): + return x.item() + + +@op(torch.ops.aten.tensor_split.sections) +def _aten_tensor_split(ary, indices_or_sections, axis=0): + return jnp.array_split(ary, indices_or_sections, axis) + + +@op(torch.ops.aten.outer) +def _aten_outer(a, b): + return jnp.outer(a, b) + + +@op(torch.ops.aten.allclose) +def _aten_allclose(input, other, rtol=1e-05, atol=1e-08, equal_nan=False): + return jnp.allclose(input, other, rtol, atol, equal_nan) + diff --git a/experimental/torch_xla2/torch_xla2/ops/jtorch.py b/experimental/torch_xla2/torch_xla2/ops/jtorch.py index 6628b7e9510..ddc04fa4b1b 100644 --- a/experimental/torch_xla2/torch_xla2/ops/jtorch.py +++ b/experimental/torch_xla2/torch_xla2/ops/jtorch.py @@ -1,7 +1,116 @@ +"""Tensor constructor overrides""" +import functools +from typing import Callable, Optional, ParamSpec, Sequence + +import jax import torch +import jax.numpy as jnp +from torch_xla2 import tensor +from torch_xla2.ops.ops_registry import register_torch_function_op + +def register_function(torch_func, **kwargs): + return functools.partial(register_torch_function_op, torch_func, **kwargs) + + +P = ParamSpec('P') + + +def convert_dtype(use_default_dtype: bool = True): + """Converts `dtype` kwarg of function from torch to JAX. + + Args: + use_default_dtype: Whether to use torch default dtype if none is provided. + + Returns: + A decorator that wraps a JAX implementation of a torch function. 
+ """ + + def decorator(func: Callable[P, torch.Tensor]): + + @functools.wraps(func) + def wrapper(*args: P.args, + dtype: Optional[torch.dtype] = None, + **kwargs: P.kwargs): + if not dtype and use_default_dtype: + dtype = torch.get_default_dtype() + jax_dtype = tensor.t2j_dtype(dtype) + + return func(*args, dtype=jax_dtype, **kwargs) + + return wrapper + + return decorator + + +@register_function(torch.tensor) +@convert_dtype(use_default_dtype=False) # Attempt to infer type from elements +def _tensor(data, *, dtype=None, **kwargs): + python_types_to_torch_types = { + bool: jnp.bool, + int: jnp.int64, + float: jnp.float32, + complex: jnp.complex64, + } + if not dtype: + leaves = jax.tree_util.tree_leaves(data) + if len(leaves) > 0: + dtype = python_types_to_torch_types.get(type(leaves[0])) + + return jnp.array( + data, dtype=dtype or tensor.t2j_dtype(torch.get_default_dtype())) + + +@register_function(torch.ones) +@convert_dtype() +def _ones(*size: int, dtype=None, **kwargs): + return jnp.ones(size, dtype) + + +@register_function(torch.zeros) +@convert_dtype() +def _zeros(*size: int, dtype=None, **kwargs): + return jnp.zeros(size, dtype) + + +@register_function(torch.eye) +@convert_dtype() +def _eye(n: int, m: Optional[int] = None, *, dtype=None, **kwargs): + return jnp.eye(n, m, dtype=dtype) + + +@register_function(torch.full) +@convert_dtype() +def _full(size: Sequence[int], fill_value, *, dtype=None, **kwargs): + # TODO: handle torch.Size + return jnp.full(size, fill_value, dtype=dtype) + + +@register_function(torch.allclose) +def _aten_allclose(input, other, rtol=1e-05, atol=1e-08, equal_nan=False): + return jnp.allclose(input, other, rtol, atol, equal_nan) + +@register_function(torch.angle) +def _torch_angle(input): + return jnp.angle(input) +@register_function(torch.argsort) +def _torch_argsort(input, dim=-1, descending=False, stable=False): + expanded = False + if input == 0: + # for self of rank 0: + # torch.any(x, 0), torch.any(x, -1) works; + # torch.any(x, 1) throws out of bounds, so it's + # behavior is the same as a jnp array of rank 1 + expanded = True + input = jnp.expand_dims(input, 0) + res = jnp.argsort(input, axis=dim, descending=descending, + stable=stable) + if expanded: + res = res.squeeze() + return res -torch_ops_override = { - torch.allclose: torch.ops.aten.allclose -} \ No newline at end of file +@register_function(torch.einsum) +def _einsum(equation, *operands): + assert isinstance(equation, str), 'Only accept str equation' + return jnp.einsum(equation, *operands) \ No newline at end of file diff --git a/experimental/torch_xla2/torch_xla2/ops/op_base.py b/experimental/torch_xla2/torch_xla2/ops/op_base.py index 62df160edc9..983d20fb660 100644 --- a/experimental/torch_xla2/torch_xla2/ops/op_base.py +++ b/experimental/torch_xla2/torch_xla2/ops/op_base.py @@ -1,22 +1,11 @@ import torch -from torch_xla2 import extra - -class JaxOperator: - """This is a aten op backed by jax function.""" - - def __init__(self, jax_callable): - self.jax = jax_callable - - def __call__(self, *args, **kwargs): - # args are torch.Tensor - res = call_jax(self.jax, args, kwargs) - return res +from torch_xla2 import interop class BinaryOpWithPromotion: - def __init__(self, jax_callable): - self.jax = jax_callable + def __init__(self, inner): + self.inner = inner def _get_dtype(self, obj): if isinstance(obj, torch.Tensor): @@ -31,7 +20,7 @@ def _get_dtype(self, obj): def __call__(self, *args, **kwargs): # args are torch.Tensor - res = extra.torch_view(self.jax)(*args, **kwargs) + res = 
interop.torch_view(self.jax)(*args, **kwargs) dtype = torch.promote_types( self._get_dtype(args[0]), @@ -41,15 +30,6 @@ def __call__(self, *args, **kwargs): return res -class TorchLowering: - - def __init__(self, lowering): - self.lowering = lowering - - def __call__(self, *args, **kwargs): - return self.lowering(*args, **kwargs) - - class InplaceOp: def __init__(self, functional_op, position_to_mutate=0): @@ -58,7 +38,7 @@ def __init__(self, functional_op, position_to_mutate=0): def __call__(self, *args, **kwargs): to_mutate = args[0] - to_mutate._elem = self.functional(*args, **kwargs)._elem + to_mutate.copy_(self.functional(*args, **kwargs)) return to_mutate diff --git a/experimental/torch_xla2/torch_xla2/ops/ops_registry.py b/experimental/torch_xla2/torch_xla2/ops/ops_registry.py new file mode 100644 index 00000000000..e75d1549456 --- /dev/null +++ b/experimental/torch_xla2/torch_xla2/ops/ops_registry.py @@ -0,0 +1,47 @@ +import dataclasses +from torch_xla2.types import JaxCallable, TorchCallable + +from typing import Union, Dict + + +@dataclasses.dataclass +class Operator: + torch_op: TorchCallable + func: Union[TorchCallable, JaxCallable] + is_jax_function: bool + is_user_defined: bool + needs_env: bool + + +all_aten_ops: Dict[TorchCallable, Operator] = {} +all_torch_functions: Dict[TorchCallable, Operator] = {} + + +def register_torch_dispatch_op( + aten_op, impl_callable, + is_jax_function=True, + is_user_defined=False, + needs_env=False, +): + op = Operator( + aten_op, impl_callable, + is_jax_function=is_jax_function, + is_user_defined=is_user_defined, + needs_env=needs_env) + all_aten_ops[aten_op] = op + return impl_callable + + +def register_torch_function_op( + torch_func, impl_callable, + is_jax_function=True, + is_user_defined=False, + needs_env=False, +): + op = Operator( + torch_func, impl_callable, + is_jax_function=is_jax_function, + is_user_defined=is_user_defined, + needs_env=needs_env) + all_torch_functions[torch_func] = op + return impl_callable \ No newline at end of file diff --git a/experimental/torch_xla2/torch_xla2/ops_registry.py b/experimental/torch_xla2/torch_xla2/ops_registry.py deleted file mode 100644 index f1d115864d3..00000000000 --- a/experimental/torch_xla2/torch_xla2/ops_registry.py +++ /dev/null @@ -1,74 +0,0 @@ -import torch -import torch._decomp as decomp -import torch_xla2.decompositions - -class LoweringRegistry: - - def __init__(self): - self.registered_ops = {} - self.decomps = {} - - def lookup(self, op_or_name): - candidate = self._lookup(op_or_name) - if candidate is None: - if isinstance(op_or_name, torch._ops.OpOverloadPacket): - candidate = self._lookup(op_or_name.default) - if isinstance(op_or_name, torch._ops.OpOverload): - candidate = self._lookup(op_or_name.overloadpacket) - return candidate - - def _lookup(self, op): - candidate = self.registered_ops.get(op) - if candidate is None: - candidate = self.decomp.get(op) - return candidate - - def register(self, op, lowering): - if isinstance(op, torch._ops.OpOverloadPacket): - if hasattr(op, 'default'): - self.registered_ops[op.default] = lowering - self.registered_ops[op] = lowering - - -lowerings = LoweringRegistry() -EXTRA_DECOMP = decomp.get_decompositions([ - torch.ops.aten.upsample_nearest2d, - torch.ops.aten._native_batch_norm_legit.no_stats, - torch.ops.aten._adaptive_avg_pool2d, - torch.ops.aten._adaptive_avg_pool3d, - torch.ops.aten.grid_sampler_2d, - torch.ops.aten.native_dropout, - torch.ops.aten.reflection_pad1d, - torch.ops.aten.reflection_pad2d, - 
torch.ops.aten.reflection_pad3d, - torch.ops.aten.replication_pad1d, - torch.ops.aten.replication_pad2d, - torch.ops.aten.replication_pad3d, -]) -CORE_ATEN_DECOMP = decomp.core_aten_decompositions() -CORE_ATEN_DECOMP.update(EXTRA_DECOMP) -lowerings.decomp = CORE_ATEN_DECOMP - - -def _all_core_ops(): - """Yields all core ops.""" - import torch._ops - - for k, v in torch.ops.aten.__dict__.items(): - if k.startswith('__'): - continue - if k.startswith('_'): - continue - if isinstance(v, torch._ops.OpOverloadPacket): - for overload in v.overloads(): - op = getattr(v, overload) - if torch.Tag.core in op.tags: - yield v - break - - -def print_missing_ops(): - core_aten = set(_all_core_ops()) - existing = set(lowerings.registered_ops.keys()) - for v in core_aten - existing: - print(v) diff --git a/experimental/torch_xla2/torch_xla2/tensor.py b/experimental/torch_xla2/torch_xla2/tensor.py index 98953a8b04c..262bc95f566 100644 --- a/experimental/torch_xla2/torch_xla2/tensor.py +++ b/experimental/torch_xla2/torch_xla2/tensor.py @@ -1,53 +1,16 @@ -import functools +import contextlib import jax from jax import dlpack as jaxdl import jax.numpy as jnp import numpy import torch import torch.func -import torch._decomp.decompositions -from torch_xla2 import ops_registry import torch.utils._python_dispatch as torch_dispatch import torch.utils._pytree as torch_pytree import torch.utils.dlpack as torchdl -from torch_xla2.ops import jaten -from torch._subclasses.fake_tensor import FakeTensorMode -fake_mode = FakeTensorMode() - - -class XLADispatchMode(torch_dispatch.TorchDispatchMode): - - def __torch_dispatch__(self, fn, types, args=(), kwargs=None): - if fn in constructors: - args, kwargs = unwrap((args, kwargs)) - res = constructors[fn](*args, **kwargs) - return wrap(res) - - return fn(*args, **kwargs) - - -def _aten_arange(start, - end, - *, - dtype=None, - layout=None, - requires_grad=False, - device=None, - pin_memory=False): - return jnp.arange(start, end, 1) - - -def _aten_scalar_tensor(val, **kwargs): - p = torch.ops.aten.scalar_tensor(val) - return wrap(t2j(p)) - - -constructors = { - torch.ops.aten.scalar_tensor.default: _aten_scalar_tensor, - torch.ops.aten.arange.default: functools.partial(_aten_arange, 0), - torch.ops.aten.arange.start: _aten_arange, -} +class OperatorNotFound(Exception): + pass def wrap(jaxarray): @@ -61,7 +24,9 @@ def unwrap(torchtensors): def t2j(t): if isinstance(t, XLATensor2): return t._elem + is_bool = False if t.dtype == torch.bool: + is_bool = True t = t.to(torch.int8) if not t.is_contiguous(): @@ -82,7 +47,7 @@ def t2j(t): if t.dtype == torch.bfloat16: res = res.astype(jnp.bfloat16) - if t.dtype == torch.bool: + if is_bool: res = res.astype(jnp.bool_) return res @@ -97,48 +62,41 @@ def j2t(x): res = res.to(torch.bool) return res +TORCH_DTYPE_TO_JAX = { + torch.float16: jnp.dtype('float16'), + torch.bfloat16: jnp.dtype('bfloat16'), + torch.half: jnp.dtype('float16'), + torch.float32: jnp.dtype('float32'), + torch.double: jnp.dtype('double'), + torch.long: jnp.dtype('int64'), + torch.int32: jnp.dtype('int32'), + torch.int16: jnp.dtype('int16'), + torch.int8: jnp.dtype('int8'), + torch.uint8: jnp.dtype('uint8'), + torch.bool: jnp.dtype('bool_'), + torch.complex64: jnp.dtype('complex64'), + torch.complex128: jnp.dtype('complex128'), + None: None, +} + +JAX_DTYPE_TO_TORCH = {value: key for key, value in TORCH_DTYPE_TO_JAX.items()} def t2j_dtype(dtype): - return { - torch.float16: jnp.float16, - torch.bfloat16: jnp.bfloat16, - torch.half: jnp.float16, - torch.float32: 
jnp.float32, - torch.double: jnp.double, - torch.long: jnp.int64, - torch.int32: jnp.int32, - torch.int16: jnp.int16, - torch.int8: jnp.int8, - torch.uint8: jnp.uint8, - torch.bool: jnp.bool_, - torch.complex64: jnp.complex64, - torch.complex128: jnp.complex128, - }.get(dtype) + if dtype not in TORCH_DTYPE_TO_JAX: + raise RuntimeError(f'Attempting to convert unknown type: {dtype} to torch type,') + return TORCH_DTYPE_TO_JAX[dtype] def j2t_dtype(dtype): - return { - jnp.float16: torch.float16, - jnp.bfloat16: torch.bfloat16, - jnp.double: torch.double, - jnp.float32: torch.float32, - jnp.float16: torch.half, - jnp.int64: torch.long, - jnp.int32: torch.int32, - jnp.int16: torch.int16, - jnp.bool_: torch.bool, - jnp.complex64: torch.complex64, - }.get(dtype) - - -def move_to_device(t): - return XLATensor2(t2j(t)) + if dtype not in JAX_DTYPE_TO_TORCH: + raise RuntimeError(f'Attempting to convert unknown type: {dtype} to torch type,') + return JAX_DTYPE_TO_TORCH[dtype] class XLATensor2(torch.Tensor): @staticmethod - def __new__(cls, elem): + def __new__(cls, elem, env): dtype = j2t_dtype(elem.dtype) shape = list(elem.shape) for i, s in enumerate(shape): @@ -154,9 +112,10 @@ def __new__(cls, elem): requires_grad=False, ) - def __init__(self, elem: jax.Array): + def __init__(self, elem: jax.Array, env: 'Environment'): super().__init__() self._elem = elem + self._env = env def __str__(self): return "XLATensor2({} {})".format(str(type(self._elem)), str(self._elem)) @@ -178,7 +137,7 @@ def flatten(self, start_dim=0, end_dim=-1): new_shape = ( self._elem.shape[:start_dim] + (-1,) + self._elem.shape[end_dim:]) new_elem = jnp.reshape(self._elem, new_shape) - return XLATensor2(new_elem) + return XLATensor2(new_elem, self._env) # return torch.reshape(self, new_shape) def __setitem__(self, key, val): @@ -193,32 +152,17 @@ def type_as(self, other): @classmethod def __torch_dispatch__(cls, func, types, args=(), kwargs=None): - kwargs = kwargs or {} - with jax.named_scope(func.name()): + env = None + for arg in torch_pytree.arg_tree_leaves(*args, **kwargs): + if isinstance(arg, XLATensor2): + env = arg._env + break - if isinstance(func, torch._ops.OpOverloadPacket): - return func(*args, **kwargs) - - if func.name() == 'aten::copy_': - x, y = args - x._elem = y._elem - return - - if func.overloadpacket in jaten.all_ops: - return jaten.all_ops[func.overloadpacket](*args, **kwargs) - - lowering = ops_registry.lowerings.lookup(func) - - if lowering is None: - raise RuntimeError("No lowering found for", func.name()) - - with XLADispatchMode(): - res = lowering(*args, **kwargs) - debug_accuracy(func, args, kwargs, res) - return res + with env: + return func(*args, **(kwargs or {})) def detach(self): - return XLATensor2(jax.lax.stop_gradient(self.jax())) + return XLATensor2(jax.lax.stop_gradient(self.jax()), self._env) def numpy(self) -> numpy.ndarray: import numpy as np @@ -231,6 +175,20 @@ def jax(self) -> jax.Array: def torch(self) -> torch.Tensor: return j2t(self.jax()) + def to(self, *args, **kwargs): + if len(args) == 1: + if isinstance(args[0], torch.dtype): + return XLATensor2(self._elem.astype(t2j_dtype(args[0])), self._env) + if 'dtype' in kwargs: + dtype = kwargs['dtype'] + return XLATensor2(self._elem.astype(t2j_dtype(dtype)), self._env) + return self + + @property + def dtype(self): + return j2t_dtype(self._elem.dtype) + + # TODO: slice of slice should also be another slice class SliceView(XLATensor2): @@ -281,3 +239,159 @@ def debug_accuracy(func, args, kwargs, current_output): pdb.set_trace() return 
True + + +class XLAFunctionMode(torch.overrides.TorchFunctionMode): + """Context manager that dispatches torch function calls to JAX.""" + + def __init__(self, env): + self.env = env + + def __torch_function__(self, + func, + types, + args=(), + kwargs=None) -> torch.Tensor: + try: + return self.env.dispatch(func, types, args, kwargs) + except OperatorNotFound: + return func(*args, **(kwargs or {})) + + +class XLADispatchMode(torch_dispatch.TorchDispatchMode): + + def __init__(self, env): + self.env = env + + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + if isinstance(func, torch._ops.OpOverloadPacket): + with self: + return func(*args, **kwargs) + if func.namespace != 'aten': + return func(*args, **kwargs) + return self.env.dispatch(func, types, args, kwargs) + +def _name_of_func(func): + if hasattr(func, 'name'): + return func.name() + return func.__name__ + + +class Environment(contextlib.ContextDecorator): + """This class holds a set of configurations and "globals" needed + + for executing torch program using jax. + Things included so far: + + op registry + PRNGKey + Configs + + Also helper functions to manipulate those. + """ + + _prng_key: jax.random.PRNGKey + + + def __init__(self, random_seed): + self._prng_key = jax.random.PRNGKey(random_seed) + self._function_mode = XLAFunctionMode(self) + self._dispatch_mode = XLADispatchMode(self) + + # name is torch callable + self._ops = {} + self.load_ops() + + def load_ops(self): + from torch_xla2.ops import jaten, jtorch, ops_registry + self._ops.update(ops_registry.all_aten_ops) + self._ops.update(ops_registry.all_torch_functions) + + decomps = torch._decomp.core_aten_decompositions() + from torch_xla2.decompositions import EXTRA_DECOMP + decomps.update(EXTRA_DECOMP) + for k, v in decomps.items(): + if k not in self._ops: + self._ops[k] = ops_registry.Operator( + k, + v, + is_jax_function=False, + is_user_defined=False, + needs_env=False + ) + + def get_and_rotate_prng_key(self): + self._prng_key, key = jax.random.split(self._prng_key) + return key + + def dispatch(self, func, types, args, kwargs): + with jax.named_scope(_name_of_func(func)): + kwargs = kwargs or {} + op = self._ops.get(func) + + if op is None and isinstance(func, torch._ops.OpOverloadPacket): + op = self._ops.get(func.default) + + if op is None and isinstance(func, torch._ops.OpOverload): + op = self._ops.get(func.overloadpacket) + + if op is None: + raise OperatorNotFound( + f'Operator with name {_name_of_func(func)} has no lowering') + + if op.is_jax_function: + args, kwargs = self.t2j_iso((args, kwargs)) + + if op.needs_env: + kwargs['env'] = self + + with self: + res = op.func(*args, **kwargs) + + if op.is_jax_function: + res = self.j2t_iso(res) + + #if self.config.debug_accuracy_for_each_op: + # debug_accuracy(func, args, kwargs, res) + return res + + def __enter__(self): + self._dispatch_mode.__enter__() + self._function_mode.__enter__() + return self + + def __exit__(self, *exc): + self._function_mode.__exit__(*exc) + self._dispatch_mode.__exit__(*exc) + + def _move_one_value(self, val): + if isinstance(val, torch.nn.Module): + state_dict = self.to_xla(val.state_dict()) + val.load_state_dict(state_dict, assign=True) + return val + if isinstance(val, XLATensor2): + return val + if isinstance(val, torch.Tensor): + return XLATensor2(t2j(val), self) + return val + + def to_xla(self, torchvalues): + # tensors are torch.Tensors (not XLATensor) + res = torch_pytree.tree_map( + self._move_one_value, + torchvalues) + return res + + def t2j_iso(self, 
torchtensors):
+    return torch_pytree.tree_map_only(
+        XLATensor2, lambda x: x.jax(), torchtensors)
+
+  def j2t_iso(self, jaxarray):
+    return torch_pytree.tree_map_only(
+        jnp.ndarray, lambda x: XLATensor2(x, self), jaxarray)
+
+  def j2t_copy(self, args):
+    pass
diff --git a/experimental/torch_xla2/torch_xla2/types.py b/experimental/torch_xla2/torch_xla2/types.py
new file mode 100644
index 00000000000..f39d530c18d
--- /dev/null
+++ b/experimental/torch_xla2/torch_xla2/types.py
@@ -0,0 +1,12 @@
+from typing import TypeAlias, Callable, ParamSpec, Any, Union
+import torch
+import jax
+import jax.numpy as jnp
+
+
+P = ParamSpec('P')
+
+TorchValue: TypeAlias = Union[torch.Tensor, torch.dtype, 'TorchCallable', Any]
+TorchCallable: TypeAlias = Callable[P, TorchValue]
+JaxValue: TypeAlias = Union[jax.Array, jnp.dtype, 'JaxCallable', Any]
+JaxCallable: TypeAlias = Callable[P, JaxValue]
\ No newline at end of file