diff --git a/.github/workflows/gpu-hvd-tests.yml b/.github/workflows/gpu-hvd-tests.yml
new file mode 100644
index 00000000000..c3658b10282
--- /dev/null
+++ b/.github/workflows/gpu-hvd-tests.yml
@@ -0,0 +1,188 @@
+name: Run HVD-specific unit tests on GPUs
+on:
+  push:
+    paths:
+      - "ignite/**"
+      - "tests/ignite/**"
+      - "tests/run_gpu_tests.sh"
+      - "tests/run_code_style.sh"
+      - "examples/**.py"
+      - "requirements-dev.txt"
+      - ".github/workflows/gpu-hvd-tests.yml"
+  workflow_dispatch:
+
+concurrency:
+  # --
+  group: gpu-hvd-tests-${{ github.ref_name }}-${{ !(github.ref_protected) || github.sha }}
+  cancel-in-progress: true
+
+# Cherry-picked from https://github.com/pytorch/test-infra/blob/main/.github/workflows/linux_job.yml
+
+jobs:
+  gpu-hvd-tests:
+    strategy:
+      matrix:
+        pytorch-channel: [pytorch]
+      fail-fast: false
+    env:
+      DOCKER_IMAGE: "pytorch/conda-builder:cuda11.8"
+      REPOSITORY: ${{ github.repository }}
+      PR_NUMBER: ${{ github.event.pull_request.number }}
+    runs-on: linux.8xlarge.nvidia.gpu
+    timeout-minutes: 60
+
+    steps:
+      - name: Clean workspace
+        run: |
+          echo "::group::Cleanup debug output"
+          sudo rm -rfv "${GITHUB_WORKSPACE}"
+          mkdir -p "${GITHUB_WORKSPACE}"
+          echo "::endgroup::"
+
+      - name: Checkout repository (pytorch/test-infra)
+        uses: actions/checkout@v3
+        with:
+          # Support the use case where we need to checkout someone's fork
+          repository: pytorch/test-infra
+          path: test-infra
+
+      - name: Setup Linux
+        uses: ./test-infra/.github/actions/setup-linux
+
+      - name: Pull docker image
+        uses: ./test-infra/.github/actions/pull-docker-image
+        with:
+          docker-image: ${{ env.DOCKER_IMAGE }}
+
+      - name: Checkout repository (${{ github.repository }})
+        uses: actions/checkout@v3
+        with:
+          # Support the use case where we need to checkout someone's fork
+          repository: ${{ github.repository }}
+          ref: ${{ github.ref }}
+          path: ${{ github.repository }}
+          fetch-depth: 1
+
+      - name: Start Pytorch container
+        working-directory: ${{ github.repository }}
+        run: |
+          docker run --name pthd --gpus=all --rm \
+            --cap-add=SYS_PTRACE \
+            --detach \
+            --ipc=host \
+            --security-opt seccomp=unconfined \
+            --shm-size=2g \
+            --tty \
+            --ulimit stack=10485760:83886080 \
+            -v $PWD:/work \
+            -w /work \
+            ${DOCKER_IMAGE}
+
+          script=$(cat << EOF
+
+          set -xe
+
+          nvidia-smi
+          ls -alh
+
+          conda --version
+          python --version
+
+          EOF
+          )
+          docker exec -t pthd /bin/bash -c "${script}"
+
+      - name: Install PyTorch and dependencies
+        continue-on-error: false
+        run: |
+
+          script=$(cat << EOF
+
+          set -xe
+
+          # Install PyTorch
+          if [ "${{ matrix.pytorch-channel }}" == "pytorch" ]; then
+            pip install --upgrade torch torchvision --index-url https://download.pytorch.org/whl/cu118
+          else
+            pip install --upgrade --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu118
+          fi
+
+          python -c "import torch; print(torch.__version__, ', CUDA is available: ', torch.cuda.is_available()); exit(not torch.cuda.is_available())"
+          pip list
+
+          # Install dependencies
+          pip install -r requirements-dev.txt
+          pip install -e .
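+          # (editable install: the HVD test steps below run ignite from this /work checkout)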
+
+          EOF
+          )
+
+          docker exec -t pthd /bin/bash -c "${script}"
+
+      - name: Install Horovod with NCCL GPU ops
+        run: |
+          script=$(cat << EOF
+
+          set -xe
+
+          HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]
+          horovodrun --check-build
+          pip list
+
+          EOF
+          )
+
+          docker exec -t pthd /bin/bash -c "${script}"
+
+      - name: Run GPU and CPU Unit HVD Tests
+        run: |
+
+          script=$(cat << EOF
+
+          set -xe
+
+          bash tests/run_gpu_tests.sh 2 hvd
+          CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k hvd
+
+          EOF
+          )
+
+          docker exec -t pthd /bin/bash -c "${script}"
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v3
+        with:
+          file: ${{ github.repository }}/coverage.xml
+          flags: gpu-2
+          fail_ci_if_error: false
+
+      - name: Run examples in container
+        continue-on-error: false
+        run: |
+          script=$(cat << EOF
+
+          set -xe
+
+          # Install additional example dependencies
+          pip install fire
+
+          # Check training on CIFAR10, run with horovod backend using horovodrun
+          # initial run
+          CI=1 horovodrun -np 2 python -u examples/cifar10/main.py run --backend=horovod --checkpoint_every=200 --stop_iteration=500
+          # resume
+          CI=1 horovodrun -np 2 python examples/cifar10/main.py run --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-horovod-2_stop-on-500/training_checkpoint_400.pt
+
+          # Check training on CIFAR10 using spawn
+          # initial run
+          CI=1 python -u examples/cifar10/main.py run --backend=horovod --nproc_per_node=2 --checkpoint_every=200 --stop_iteration=500
+          # resume
+          CI=1 python -u examples/cifar10/main.py run --backend=horovod --nproc_per_node=2 --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-horovod-2_stop-on-500/training_checkpoint_400.pt
+
+          EOF
+          )
+
+          docker exec -t pthd /bin/bash -c "${script}"
+
+      - name: Teardown Linux
+        if: ${{ always() }}
+        uses: ./test-infra/.github/actions/teardown-linux
diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml
index 93aae8c60e7..efd3682c09e 100644
--- a/.github/workflows/gpu-tests.yml
+++ b/.github/workflows/gpu-tests.yml
@@ -19,7 +19,7 @@ concurrency:
 # Cherry-picked from https://github.com/pytorch/test-infra/blob/main/.github/workflows/linux_job.yml
 jobs:
-  gpu-tests:
+  gpu-tests:
     strategy:
       matrix:
         pytorch-channel: [pytorch, pytorch-nightly]
       fail-fast: false
@@ -80,7 +80,7 @@ jobs:
 
           script=$(cat << EOF
 
-          set -x
+          set -xe
 
           nvidia-smi
           ls -alh
@@ -98,7 +98,7 @@ jobs:
 
           script=$(cat << EOF
 
-          set -x
+          set -xe
 
           # Install PyTorch
           if [ "${{ matrix.pytorch-channel }}" == "pytorch" ]; then
@@ -119,13 +119,13 @@ jobs:
 
           docker exec -t pthd /bin/bash -c "${script}"
 
-      - name: Run 1 Node 2 GPUs Unit Tests
+      - name: Run GPU Unit Tests
         continue-on-error: false
         run: |
 
           script=$(cat << EOF
 
-          set -x
+          set -xe
 
           bash tests/run_gpu_tests.sh 2
 
@@ -145,8 +145,8 @@ jobs:
         continue-on-error: false
         run: |
           SCRIPT=$(cat << EOF
-
-          set -x
+
+          set -xe
 
           # Install additional example dependencies
           pip install fire
@@ -156,7 +156,7 @@ jobs:
           CI=1 python examples/cifar10/main.py run --checkpoint_every=200 --stop_iteration=500
           ## resume
           CI=1 python examples/cifar10/main.py run --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-None-1_stop-on-500/training_checkpoint_400.pt
-
+
           # Check training on cifar10, run with NCCL backend using torchrun
           ## initial run
           CI=1 torchrun --nproc_per_node=2 examples/cifar10/main.py run --backend=nccl --checkpoint_every=200 --stop_iteration=500
diff --git a/tests/ignite/distributed/comp_models/test_horovod.py b/tests/ignite/distributed/comp_models/test_horovod.py
index 5f840d51615..264813cd584 100644
--- a/tests/ignite/distributed/comp_models/test_horovod.py
+++ b/tests/ignite/distributed/comp_models/test_horovod.py
@@ -184,7 +184,7 @@ def _test__hvd_dist_model_warning_index_less_localrank():
 @pytest.mark.distributed
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Skip if less than 2 GPUs")
 def test__hvd_dist_model_warning_index_less_localrank(gloo_hvd_executor):
-    gloo_hvd_executor(_test__hvd_dist_model_warning_index_less_localrank, (), num_proc=torch.cuda.device_count())
+    gloo_hvd_executor(_test__hvd_dist_model_warning_index_less_localrank, (), np=torch.cuda.device_count())
 
 
 def _test_dist_spawn_fn(local_rank, backend, world_size, device):
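
Note (not part of the patch): the `num_proc=` → `np=` rename above matches the keyword that Horovod's launcher actually accepts; `gloo_hvd_executor` forwards its keyword arguments to `horovod.run`, whose process-count parameter is named `np`. A minimal sketch of such a fixture, assuming the conftest wires it roughly like this (hypothetical; ignite's actual fixture may differ):

    # hypothetical conftest.py fixture, shown for context only
    from functools import partial

    import horovod
    import pytest

    @pytest.fixture()
    def gloo_hvd_executor():
        # horovod.run(func, args=(), np=1, ...) spawns `func` in `np`
        # worker processes coordinated by the Gloo controller
        yield partial(horovod.run, use_gloo=True)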