pytorch · bhavya01 · Jun 7, 2024 · Oct 4, 2023 · Mar 20, 2024 · Mar 20, 2024
diff --git a/.github/workflows/_build_torch_with_cuda.yml b/.github/workflows/_build_torch_with_cuda.yml
@@ -7,49 +7,64 @@ on:
         type: string
         description: Base image for builds
       torch-commit:
-          required: true
-          type: string
-          description: torch-commit
+        required: true
+        type: string
+        description: torch-commit
       runner:
         required: false
         type: string
         description: Runner type for the test
         default: linux.12xlarge
+    secrets:
+      gcloud-service-key:
+        required: true
+        description: Secret to access Bazel build cache
 jobs:
   build:
     runs-on: ${{ inputs.runner }}
     container:
       image: ${{ inputs.dev-image }}
-      options: "--gpus all --shm-size 16g"
     env:
+      GCLOUD_SERVICE_KEY: ${{ secrets.gcloud-service-key }}
+      GOOGLE_APPLICATION_CREDENTIALS: /tmp/default_credentials.json
+      BAZEL_JOBS: 16
+      BAZEL_REMOTE_CACHE: 1
       _GLIBCXX_USE_CXX11_ABI: 0
     steps:
       # See https://github.com/actions/checkout/issues/1014#issuecomment-1906802802
       - name: Clean up workspace
         run: |
           ls -la
           rm -rvf ${GITHUB_WORKSPACE}/*
+      # Write the key to the path that GOOGLE_APPLICATION_CREDENTIALS names.
+      - name: Setup gcloud
+        shell: bash
+        run: |
+          echo "${GCLOUD_SERVICE_KEY}" > $GOOGLE_APPLICATION_CREDENTIALS
       - name: Setup CUDA environment
         shell: bash
         run: |
           echo "PATH=$PATH:/usr/local/cuda-12.1/bin" >> $GITHUB_ENV
           echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.1/lib64" >> $GITHUB_ENV
-      - name: Check GPU
-        run: nvidia-smi
       - name: Checkout PyTorch Repo
         uses: actions/checkout@v4
         with:
           repository: pytorch/pytorch
           path: pytorch
+          # Build the commit chosen by the caller, not the default branch HEAD.
           ref: ${{ inputs.torch-commit }}
           submodules: recursive
+      - name: Checkout PyTorch/XLA Repo
+        uses: actions/checkout@v4
+        with:
+          path: pytorch/xla
       - name: Build
         shell: bash
         run: |
-          cd pytorch
-          USE_CUDA=1 python setup.py bdist_wheel
+          cd pytorch/xla/infra/ansible
+          ansible-playbook playbook.yaml -vvv -e "stage=build arch=amd64 accelerator=cuda src_root=${GITHUB_WORKSPACE} cuda_compute_capabilities=compute_86 torch_cuda_arch_list=8.6 build_pytorch_with_cuda=1 bundle_libtpu=0 build_cpp_tests=1 git_versioned_xla_build=1 cache_suffix=-ci" --skip-tags=fetch_srcs,install_deps
       - name: Upload wheel
         uses: actions/upload-artifact@v4
         with:
-          name: torch-with-cuda
-          path: pytorch/dist/*.whl
+          name: torch-cuda-wheels
+          path: /dist/*.whl
diff --git a/.github/workflows/_test_requiring_torch_cuda.yml b/.github/workflows/_test_requiring_torch_cuda.yml
@@ -19,7 +19,7 @@ on:
       timeout-minutes:
         required: false
         type: number
-        default: 30
+        default: 270
         description: |
           Set the maximum (in minutes) how long the workflow should take to finish
             timeout-minutes:
@@ -46,30 +46,14 @@ jobs:
         run: |
           ls -la
           rm -rvf ${GITHUB_WORKSPACE}/*
-      - name: Fetch torch/torch_xla/torchvision wheels
-        uses: actions/download-artifact@v4
-        with:
-          name: torch-xla-wheels
-          path: /tmp/wheels/
-      - name: Remove torch wheel built with CUDA disabled
-        shell: bash
-        run: |
-          rm -rf /tmp/wheels/torch-*
       - name: Fetch the torch wheel built with CUDA enabled
         uses: actions/download-artifact@v4
         with:
-          name: torch-with-cuda
-          path: /tmp/wheels/
-      - name: Fetch CUDA plugin
-        uses: actions/download-artifact@v4
-        with:
-          name: cuda-plugin
+          name: torch-cuda-wheels
           path: /tmp/wheels/
       - name: Setup CUDA environment
         shell: bash
         run: |
-          echo "XLA_REGISTER_INSTALLED_PLUGINS=1" >> $GITHUB_ENV
-
           echo "PATH=$PATH:/usr/local/cuda-12.1/bin" >> $GITHUB_ENV
           echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.1/lib64" >> $GITHUB_ENV
       - name: Check GPU
@@ -81,7 +65,6 @@ jobs:
           # TODO: Add these in setup.py
           pip install fsspec
           pip install rich
-
           echo "Import check..."
           python -c "import torch, torch_xla, torchvision"
           echo "Import check done."
@@ -98,9 +81,19 @@ jobs:
         uses: actions/checkout@v4
         with:
           path: pytorch/xla
+      - name: Extra CI deps
+        shell: bash
+        run: |
+          set -x
+          pip install expecttest unittest-xml-reporting
+          pip install -U --pre jaxlib -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
+          pip install -U --pre jax-cuda12-pjrt jax-cuda12-plugin -f https://storage.googleapis.com/jax-releases/jax_cuda_plugin_nightly_releases.html
+          pip install -U --pre jax -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
+          pip install --no-deps triton==2.3.0
       - name: Test
         shell: bash
         run: |
           set -xue
+          PJRT_DEVICE=CUDA TRITON_PTXAS_PATH=/usr/local/cuda-12.1/bin/ptxas python pytorch/xla/test/test_triton.py
           PJRT_DEVICE=CUDA python pytorch/xla/test/test_operations.py -v
-          PJRT_DEVICE=CUDA python pytorch/xla/test/dynamo/test_dynamo.py -v
+          PJRT_DEVICE=CUDA python pytorch/xla/test/dynamo/test_dynamo.py -v
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
@@ -42,7 +42,9 @@ jobs:
       # note that to build a torch wheel with CUDA enabled, we do not need a GPU runner.
       dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
       torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
-      runner: linux.8xlarge.nvidia.gpu
+      runner: linux.24xlarge
+    secrets:
+      gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
 
   build-cuda-plugin:
     name: "Build XLA CUDA plugin"
@@ -52,6 +54,17 @@ jobs:
     secrets:
       gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
 
+  test-cuda-with-pytorch-cuda-enabled:  # calls the reusable torch-CUDA test workflow
+    name: "GPU tests requiring torch CUDA"
+    uses: ./.github/workflows/_test_requiring_torch_cuda.yml
+    needs: [build-torch-with-cuda, get-torch-commit]  # get-torch-commit supplies torch-commit below
+    with:
+      dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
+      runner: linux.g5.4xlarge.nvidia.gpu
+      timeout-minutes: 300  # overrides the reusable workflow's default of 270
+      collect-coverage: false
+      torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
+
   test-python-cpu:
     name: "CPU tests"
     uses: ./.github/workflows/_test.yml
@@ -78,17 +91,6 @@ jobs:
     secrets:
       gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
 
-  test-cuda-with-pytorch-cuda-enabled:
-    name: "GPU tests requiring torch CUDA"
-    uses: ./.github/workflows/_test_requiring_torch_cuda.yml
-    needs: [build-torch-with-cuda, build-torch-xla, build-cuda-plugin, get-torch-commit]
-    with:
-      dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
-      runner: linux.8xlarge.nvidia.gpu
-      timeout-minutes: 300
-      collect-coverage: false
-      torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
-
   test-tpu:
     name: "TPU tests"
     uses: ./.github/workflows/_tpu_ci.yml

diff --git a/infra/ansible/config/env.yaml b/infra/ansible/config/env.yaml
@@ -42,7 +42,9 @@ build_env:
 
   cuda:
     TF_CUDA_COMPUTE_CAPABILITIES: "{{ cuda_compute_capabilities }}"
+    TORCH_CUDA_ARCH_LIST: "{{ torch_cuda_arch_list | default('') }}"
     XLA_CUDA: 1
+    USE_CUDA: "{{ build_pytorch_with_cuda | default(0) }}"
 
   tpu:
     ACCELERATOR: tpu

diff --git a/infra/ansible/roles/build_srcs/tasks/main.yaml b/infra/ansible/roles/build_srcs/tasks/main.yaml
@@ -27,8 +27,7 @@
     cmd: python setup.py bdist_wheel
     chdir: "{{ (src_root, 'pytorch') | path_join }}"
     creates: "{{ (src_root, 'pytorch/dist/*.whl') | path_join }}"
-  # Set `USE_CUDA=0` as PyTorch cannot be used with GPU in eager and XLA mode.
-  environment: "{{ env_vars | combine({'USE_CUDA': 0}) }}"
+  environment: "{{ env_vars }}"
 
 - name: Find PyTorch *.whl files in pytorch/dist
   ansible.builtin.find:

diff --git a/test/test_operations.py b/test/test_operations.py
@@ -2711,18 +2711,6 @@ def test_dlpack_pytorch_cuda_to_xla(self):
     t2_cuda.fill_(6)
     self.assertTrue(torch.allclose(xla_t2.cpu(), t2_cuda.cpu()))
 
-    cuda1 = torch.device('cuda:1')
-    t3_cuda = torch.tensor(5, device=cuda1)
-    dlt3 = torch.utils.dlpack.to_dlpack(t3_cuda)
-    xla_t3 = xdlpack.from_dlpack(dlt3)
-    self.assertEqual(xla_t3.device.type, 'xla')
-    self.assertEqual(
-        xla_t3.device.index,
-        t3_cuda.device.index,
-        msg='both value should 1. xla_t3.device should be xla:1.')
-    t3_cuda.fill_(6)
-    self.assertTrue(torch.allclose(xla_t3.cpu(), t3_cuda.cpu()))
-
   @onlyIfTorchSupportsCUDA
   @onlyIfPJRTDeviceIsCUDA
   def test_dlpack_pytorch_cuda_to_xla_protocol_conversion(self):
@@ -2743,17 +2731,6 @@ def test_dlpack_pytorch_cuda_to_xla_protocol_conversion(self):
     t2_cuda.fill_(6)
     self.assertTrue(torch.allclose(xla_t2.cpu(), t2_cuda.cpu()))
 
-    cuda1 = torch.device('cuda:1')
-    t3_cuda = torch.tensor(5, device=cuda1)
-    xla_t3 = xdlpack.from_dlpack(t3_cuda)
-    self.assertEqual(xla_t3.device.type, 'xla')
-    self.assertEqual(
-        xla_t3.device.index,
-        t3_cuda.device.index,
-        msg='both value should 1. xla_t3.device should be xla:1.')
-    t3_cuda.fill_(6)
-    self.assertTrue(torch.allclose(xla_t3.cpu(), t3_cuda.cpu()))
-
   @onlyIfTorchSupportsCUDA
   @onlyIfPJRTDeviceIsCUDA
   def test_dlpack_xla_to_pytorch_cuda(self):

diff --git a/test/test_triton.py b/test/test_triton.py
@@ -0,0 +1,68 @@
+import logging
+import torch
+from torch import nn as nn
+import unittest
+
+import torch_xla.experimental.triton as xla_triton
+import torch_xla
+from torch_xla import runtime as xr
+
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def add_kernel(
+    x_ptr,  # *Pointer* to first input vector.
+    y_ptr,  # *Pointer* to second input vector.
+    output_ptr,  # *Pointer* to output vector.
+    n_elements,  # Size of the vector.
+    BLOCK_SIZE: tl.constexpr,  # Number of elements each program should process.
+    # NOTE: `constexpr` so it can be used as a shape value.
+):
+  # Element-wise vector add; Triton add kernel from https://github.com/openai/triton/blob/main/python/tutorials/01-vector-add.py#L28
+  # There are multiple 'programs' processing different slices of the data. `pid`
+  # identifies which program this invocation is:
+  pid = tl.program_id(axis=0)  # We use a 1D launch grid so axis is 0.
+  # This program will process inputs that are offset from the initial data.
+  # For instance, if you had a vector of length 256 and block_size of 64, the programs
+  # would each access the elements [0:64, 64:128, 128:192, 192:256].
+  # Note that `offsets` holds element indices; pointers are formed below as `ptr + offsets`.
+  block_start = pid * BLOCK_SIZE
+  offsets = block_start + tl.arange(0, BLOCK_SIZE)
+  # Create a mask to guard memory operations against out-of-bounds accesses.
+  mask = offsets < n_elements
+  # Load x and y through the masked pointers, so elements past `n_elements` are
+  # skipped when the input size is not a multiple of the block size.
+  x = tl.load(x_ptr + offsets, mask=mask)
+  y = tl.load(y_ptr + offsets, mask=mask)
+  output = x + y
+  # Store x + y through `output_ptr`, using the same mask as the loads.
+  tl.store(output_ptr + offsets, output, mask=mask)
+
+
+class TritonTest(unittest.TestCase):
+
+  @unittest.skipIf(xr.device_type() != 'CUDA', "This test only works on GPU.")
+  def test_gpu_custom_call_triton_add(self):
+    # Run `add_kernel` through the XLA GPU custom call and compare with `+`.
+    num_elements, elements_per_program = 16, 8
+    lhs = torch.arange(num_elements, dtype=torch.int64).to("xla")
+    rhs = torch.arange(num_elements, dtype=torch.int64).to("xla")
+    out_placeholder = torch.empty_like(lhs)
+    launch_grid = (triton.cdiv(num_elements, elements_per_program),)
+    payload = xla_triton.triton_call(
+        lhs, rhs, out_placeholder, num_elements, kernel=add_kernel,
+        grid=launch_grid, BLOCK_SIZE=elements_per_program)
+    results = torch_xla._XLAC._xla_gpu_custom_call(
+        [lhs, rhs], payload, [out_placeholder.shape], [torch.int64])
+    expected = lhs + rhs
+    self.assertTrue(torch.allclose(results[0].cpu(), expected.cpu()))
+
+
+if __name__ == '__main__':
+  logging.getLogger().setLevel(logging.INFO)
+  torch.set_default_dtype(torch.float32)
+  torch.manual_seed(42)
+  test = unittest.main(exit=False)  # hand back control to set the exit code
+  raise SystemExit(0 if test.result.wasSuccessful() else 1)  # no `sys` import
diff --git a/torch_xla/csrc/BUILD b/torch_xla/csrc/BUILD
@@ -289,6 +289,7 @@ ptxla_cc_library(
         "@xla//xla/service:hlo_verifier",
         "@xla//xla/service:sharding_propagation",
         "@xla//xla/service/spmd:spmd_partitioner",
+        "@xla//xla/service:custom_call_target_registry",
     ],
 )
 

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
@@ -69,6 +69,7 @@
 #include "tsl/profiler/lib/traceme.h"
 #include "xla/pjrt/distributed/distributed.h"
 #include "xla/python/profiler/internal/traceme_wrapper.h"
+#include "xla/service/custom_call_target_registry.h"
 #include "xla/service/hlo_parser.h"
 
 namespace torch_xla {
@@ -202,6 +203,24 @@ std::vector<std::vector<int64_t>> CreateReduceGroups(const py::list& groups) {
   return replica_groups;
 }
 
+std::vector<at::Tensor> XlaCustomCall(
+    const std::vector<at::Tensor>& inputs, const std::string& payload,
+    const std::vector<std::vector<int64_t>>& output_shapes,
+    const std::vector<py::object>& output_dtypes, bool is_tpu) {
+  // Unwrap the Python dtype objects, then dispatch on the requested backend.
+  std::vector<at::ScalarType> scalar_types;
+  scalar_types.reserve(output_dtypes.size());
+  for (const py::object& obj : output_dtypes) {
+    scalar_types.push_back(reinterpret_cast<THPDtype*>(obj.ptr())->scalar_type);
+  }
+  if (!is_tpu) {
+    return bridge::AtenFromXlaTensors(tensor_methods::gpu_custom_call(
+        bridge::GetXlaTensors(inputs), payload, output_shapes, scalar_types));
+  }
+  return bridge::AtenFromXlaTensors(tensor_methods::tpu_custom_call(
+      bridge::GetXlaTensors(inputs), payload, output_shapes, scalar_types));
+}
+
 std::vector<std::pair<int64_t, int64_t>> CreateSourceTargetPairs(
     const py::list& pairs) {
   std::vector<std::pair<int64_t, int64_t>> source_target_pairs;
@@ -2401,16 +2420,22 @@ void InitXlaModuleBindings(py::module m) {
            const std::vector<std::vector<int64_t>>& output_shapes,
            const std::vector<py::object>& output_dtypes)
             -> std::vector<at::Tensor> {
-          std::vector<at::ScalarType> dtypes;
-          dtypes.reserve(output_dtypes.size());
-          for (auto& dtype : output_dtypes) {
-            dtypes.push_back(
-                reinterpret_cast<THPDtype*>(dtype.ptr())->scalar_type);
-          }
-
-          auto xtensors = tensor_methods::tpu_custom_call(
-              bridge::GetXlaTensors(inputs), payload, output_shapes, dtypes);
-          return bridge::AtenFromXlaTensors(xtensors);
+          return XlaCustomCall(inputs, payload, output_shapes, output_dtypes,
+                               /*is_tpu=*/true);
+        });
+  m.def("_xla_gpu_custom_call",
+        [](const std::vector<at::Tensor>& inputs, const std::string& payload,
+           const std::vector<std::vector<int64_t>>& output_shapes,
+           const std::vector<py::object>& output_dtypes)
+            -> std::vector<at::Tensor> {
+          return XlaCustomCall(inputs, payload, output_shapes, output_dtypes,
+                               /*is_tpu=*/false);
+        });
+  m.def("_xla_register_custom_call_target",
+        [](const std::string& fn_name, const py::capsule& function_ptr,
+           const std::string& platform) {
+          XLA_REGISTER_CUSTOM_CALL_TARGET_WITH_SYM(
+              fn_name, function_ptr.get_pointer(), platform);
         });
   m.def("_set_xla_custom_op_name_prefix",
         [](const at::Tensor& input, const std::string& op_name_prefix,