From 75764072f9c88de497b793bd40749330bd7d86cf Mon Sep 17 00:00:00 2001
From: iefgnoix <isaacwxf23@gmail.com>
Date: Thu, 23 May 2024 23:12:43 +0000
Subject: [PATCH] merge the test script

---
 .github/workflows/_test.yml                   |  21 ++++
 .../workflows/_test_requiring_torch_cuda.yml  | 118 ------------------
 .github/workflows/build_and_test.yml          |   6 +-
 3 files changed, 25 insertions(+), 120 deletions(-)
 delete mode 100644 .github/workflows/_test_requiring_torch_cuda.yml

diff --git a/.github/workflows/_test.yml b/.github/workflows/_test.yml
index 8a454cc075bc..59a86db74d99 100644
--- a/.github/workflows/_test.yml
+++ b/.github/workflows/_test.yml
@@ -28,6 +28,11 @@ on:
         type: boolean
         default: false
         description: Whether to install CUDA plugin package
+      run-tests-requiring-torch-cuda:
+        required: false
+        type: boolean
+        default: false
+        description: Whether to run tests that require torch with CUDA enabled
 
     secrets:
       gcloud-service-key:
@@ -87,6 +92,14 @@ jobs:
         with:
           name: torch-xla-wheels
           path: /tmp/wheels/
+      # Only when run-tests-requiring-torch-cuda is true: download the CUDA
+      # build into the same directory, overwriting the torch wheel there.
+      - name: Fetch torch CUDA wheel
+        if: ${{ inputs.run-tests-requiring-torch-cuda }}
+        uses: actions/download-artifact@v4
+        with:
+          name: torch-with-cuda-xla-with-cuda-wheels
+          path: /tmp/wheels/
       - name: Fetch CPP test binaries
         uses: actions/download-artifact@v4
         with:
@@ -159,6 +172,14 @@ jobs:
       - name: Test
         shell: bash
         run: pytorch/xla/.github/scripts/run_tests.sh pytorch/ pytorch/xla/ $USE_COVERAGE
+        if: ${{ !inputs.run-tests-requiring-torch-cuda }}  # default suite only; skipped for the CUDA-torch run
+      - name: Test that requires torch with CUDA enabled
+        shell: bash
+        run: |
+          set -xue
+          PJRT_DEVICE=CUDA python pytorch/xla/test/test_operations.py -v
+          PJRT_DEVICE=CUDA python pytorch/xla/test/dynamo/test_dynamo.py -v
+        if: ${{ inputs.run-tests-requiring-torch-cuda }}
       - name: Upload coverage results
         if: ${{ inputs.collect-coverage }}
         shell: bash
diff --git a/.github/workflows/_test_requiring_torch_cuda.yml b/.github/workflows/_test_requiring_torch_cuda.yml
deleted file mode 100644
index 40f1205c7ba6..000000000000
--- a/.github/workflows/_test_requiring_torch_cuda.yml
+++ /dev/null
@@ -1,118 +0,0 @@
-name: xla-test-requiring-torch-cuda
-on:
-  workflow_call:
-    inputs:
-      dev-image:
-        required: true
-        type: string
-        description: Base image for builds
-      runner:
-        required: false
-        type: string
-        description: Runner type for the test
-        default: linux.12xlarge
-      collect-coverage:
-        required: false
-        type: boolean
-        description: Set to true to collect coverage information
-        default: false
-      timeout-minutes:
-        required: false
-        type: number
-        default: 270
-        description: |
-          Set the maximum (in minutes) how long the workflow should take to finish
-            timeout-minutes:
-
-    secrets:
-      gcloud-service-key:
-        required: true
-        description: Secret to access Bazel build cache
-jobs:
-  test:
-    runs-on: ${{ inputs.runner }}
-    container:
-      image: ${{ inputs.dev-image }}
-      options: "--gpus all --shm-size 16g"
-    timeout-minutes: ${{ inputs.timeout-minutes }}
-    env:
-      GCLOUD_SERVICE_KEY: ${{ secrets.gcloud-service-key }}
-      GOOGLE_APPLICATION_CREDENTIALS: /tmp/default_credentials.json
-      USE_COVERAGE: ${{ inputs.collect-coverage && '1' || '0' }}
-      BAZEL_JOBS: 16
-      BAZEL_REMOTE_CACHE: 1
-    steps:
-      # See https://github.com/actions/checkout/issues/1014#issuecomment-1906802802
-      - name: Clean up workspace
-        run: |
-          ls -la
-          rm -rvf ${GITHUB_WORKSPACE}/*
-      - name: Setup gcloud
-        shell: bash
-        run: |
-          echo "${GCLOUD_SERVICE_KEY}" > $GOOGLE_APPLICATION_CREDENTIALS
-      - name: Fetch torch/torch_xla/torchvision wheels with CUDA enabled
-        uses: actions/download-artifact@v4
-        with:
-          name: torch-with-cuda-xla-with-cuda-wheels
-          path: /tmp/wheels/
-          pattern: torch-*.whl 
-      - name: Fetch CUDA plugin
-        uses: actions/download-artifact@v4
-        with:
-          name: cuda-plugin
-          path: /tmp/wheels/
-      - name: Setup CUDA environment
-        shell: bash
-        run: |
-          # TODO: Make PJRT_DEVICE=CPU work with XLA_REGISTER_INSTALLED_PLUGINS=1
-          echo "XLA_REGISTER_INSTALLED_PLUGINS=1" >> $GITHUB_ENV
-
-          echo "PATH=$PATH:/usr/local/cuda-12.1/bin" >> $GITHUB_ENV
-          echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.1/lib64" >> $GITHUB_ENV
-      - name: Check GPU
-        run: nvidia-smi
-      - name: Install wheels
-        shell: bash
-        run: |
-          pip install /tmp/wheels/*.whl
-          # TODO: Add these in setup.py
-          pip install fsspec
-          pip install rich
-
-          echo "Import check..."
-          python -c "import torch, torch_xla, torchvision"
-          echo "Import check done."
-          echo "Check if CUDA is available for PyTorch..."
-          python -c "import torch; assert torch.cuda.is_available()"
-          echo "CUDA is available for PyTorch."
-      - name: Record PyTorch commit
-        run: |
-          # Don't just pipe output in shell because imports may do extra logging
-          python -c "
-          import torch_xla.version
-          with open('$GITHUB_ENV', 'a') as f:
-            f.write(f'PYTORCH_COMMIT={torch_xla.version.__torch_gitrev__}\n')
-          "
-      - name: Checkout PyTorch Repo
-        uses: actions/checkout@v4
-        with:
-          repository: pytorch/pytorch
-          path: pytorch
-          ref: ${{ env.PYTORCH_COMMIT }}
-      - name: Checkout PyTorch/XLA Repo
-        uses: actions/checkout@v4
-        with:
-          path: pytorch/xla
-      - name: Extra CI deps
-        shell: bash
-        run: |
-          set -x
-
-          pip install expecttest unittest-xml-reporting
-      - name: Test
-        shell: bash
-        run: |
-          set -xue
-          PJRT_DEVICE=CUDA python pytorch/xla/test/test_operations.py -v
-          PJRT_DEVICE=CUDA python pytorch/xla/test/dynamo/test_dynamo.py -v
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 71b86985911e..f0c0545f3602 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -71,13 +71,15 @@ jobs:
 
   test-cuda-with-pytorch-cuda-enabled:
     name: "GPU tests with PyTorch CUDA enabled"
-    uses: ./.github/workflows/_test_requiring_torch_cuda.yml
-    needs: [build-torch-with-cuda-xla-with-cuda, build-cuda-plugin]
+    uses: ./.github/workflows/_test.yml
+    needs: [build-torch-with-cuda-xla-with-cuda, build-torch-xla, build-cuda-plugin]
     with:
       dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
       runner: linux.8xlarge.nvidia.gpu
       timeout-minutes: 300
       collect-coverage: false
+      install-cuda-plugin: true
+      run-tests-requiring-torch-cuda: true
     secrets:
       gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}