From e3d88820bc6c807038ff4345649923a2af736350 Mon Sep 17 00:00:00 2001
From: Julian Qian
Date: Wed, 21 Aug 2024 15:36:47 -0700
Subject: [PATCH] only test hvd on GPU

---
Review note: hunks were edited by hand after export, so the `index` blob ids
below are stale; apply with `git apply` (add `--ignore-whitespace` if the
target tree's indentation differs). Changes relative to the exported patch:
  * callbacks_test.py: report SKIPPED via pytest.skip() on CPU-only hosts
    instead of silently passing without running the test body.
  * horovod_sync_train_test.py: call test_util.is_gpu_available() rather
    than testing the bare name `is_gpu_available`.
  * build_and_run_tests.sh: default TF_NEED_CUDA to 0 so an unset variable
    does not make `[ ... -ne 0 ]` fail with "integer expression expected".

 .../python/keras/callbacks_test.py            | 19 ++++++-------
 .../kernel_tests/horovod_sync_train_test.py   |  5 ++--
 tools/docker/cpu_tests.Dockerfile             |  4 ---
 tools/testing/build_and_run_tests.sh          | 29 ++++++++++---------
 4 files changed, 27 insertions(+), 30 deletions(-)

diff --git a/tensorflow_recommenders_addons/dynamic_embedding/python/keras/callbacks_test.py b/tensorflow_recommenders_addons/dynamic_embedding/python/keras/callbacks_test.py
index 6bc142a7c..d6feab20c 100644
--- a/tensorflow_recommenders_addons/dynamic_embedding/python/keras/callbacks_test.py
+++ b/tensorflow_recommenders_addons/dynamic_embedding/python/keras/callbacks_test.py
@@ -1,15 +1,14 @@
 import pytest
+from tensorflow.python.framework import test_util
+
 from tensorflow_recommenders_addons.dynamic_embedding.python.keras.callbacks import \
   DEHvdBroadcastGlobalVariablesCallback
 
 
-@pytest.fixture
-def broadcast_callback(root_rank=0, device='/gpu:0'):
-  # Instantiate with the corrected parameters
-  return DEHvdBroadcastGlobalVariablesCallback(root_rank=root_rank,
-                                               device=device)
-
-
-def test_on_batch_end_subsequent_calls(broadcast_callback):
-  broadcast_callback.broadcast_done = True
-  broadcast_callback.on_batch_end(1)
+def test_on_batch_end_subsequent_calls():
+  if not test_util.is_gpu_available():
+    pytest.skip('Only test when gpu is available.')
+  broadcast_callback = DEHvdBroadcastGlobalVariablesCallback(root_rank=0,
+                                                             device='/gpu:0')
+  broadcast_callback.broadcast_done = True
+  broadcast_callback.on_batch_end(1)
diff --git a/tensorflow_recommenders_addons/dynamic_embedding/python/kernel_tests/horovod_sync_train_test.py b/tensorflow_recommenders_addons/dynamic_embedding/python/kernel_tests/horovod_sync_train_test.py
index c1be9234c..de768e12a 100644
--- a/tensorflow_recommenders_addons/dynamic_embedding/python/kernel_tests/horovod_sync_train_test.py
+++ b/tensorflow_recommenders_addons/dynamic_embedding/python/kernel_tests/horovod_sync_train_test.py
@@ -33,12 +33,10 @@
 from tensorflow.python.framework import test_util
 from tensorflow.python.framework.errors_impl import NotFoundError
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import adam
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import monitored_session
-from tensorflow.python.training.optimizer import Optimizer as tf1_opt
 from tensorflow.python.training import training_util
 try:
   from tensorflow.keras.optimizers.legacy import Adam
@@ -77,7 +75,8 @@ def test_adam_minimize_trainable(self):
       self.skipTest(
           "Apple silicon devices don't support synchronous training based on Horovod."
       )
-
+    if not test_util.is_gpu_available():
+      self.skipTest('Only test when gpu is available.')
     base_opt = adam.AdamOptimizer(1.0)
     test_opt = adam.AdamOptimizer(1.0)
     self.common_minimize_trainable_v1(base_opt, test_opt, name="adam")
diff --git a/tools/docker/cpu_tests.Dockerfile b/tools/docker/cpu_tests.Dockerfile
index 0fba896ad..93adefe93 100644
--- a/tools/docker/cpu_tests.Dockerfile
+++ b/tools/docker/cpu_tests.Dockerfile
@@ -4,7 +4,6 @@ FROM python:3.9 as build_wheel
 ARG TF_VERSION="2.15.1"
 ARG PY_VERSION="3.9"
 ARG MPI_VERSION="4.1.1"
-ARG HOROVOD_VERSION="0.28.1"
 
 RUN pip install --upgrade pip
 RUN pip install --default-timeout=1000 tensorflow==$TF_VERSION
@@ -16,9 +15,6 @@ RUN apt-get update && apt-get install -y sudo rsync cmake openmpi-bin libopenmpi
 COPY tools/install_deps/install_bazelisk.sh /install/
 RUN bash /install/install_bazelisk.sh
 
-COPY tools/docker/install/install_horovod.sh /install/
-RUN /install/install_horovod.sh $HOROVOD_VERSION --only-cpu
-
 COPY requirements.txt ./
 RUN pip install -r requirements.txt
 
diff --git a/tools/testing/build_and_run_tests.sh b/tools/testing/build_and_run_tests.sh
index 3bef498ce..59cdb74ca 100644
--- a/tools/testing/build_and_run_tests.sh
+++ b/tools/testing/build_and_run_tests.sh
@@ -49,19 +49,22 @@ if ! [ -x "$(command -v nvidia-smi)" ]; then
   EXTRA_ARGS="-n auto"
 fi
 
-# Lack of HorovodJoin CPU kernels when install Horovod with NCCL
-if [ "$(uname)" != "Darwin" ]; then
-  # Mac only with MPI
-  python -m pip uninstall horovod -y
-  bash /install/install_horovod.sh $HOROVOD_VERSION --only-cpu
-fi
-# TODO(jamesrong): Test on GPU.
-CUDA_VISIBLE_DEVICES="" mpirun -np 2 -H localhost:2 --allow-run-as-root pytest -v ./tensorflow_recommenders_addons/dynamic_embedding/python/kernel_tests/horovod_sync_train_test.py
-# Reinstall Horovod after tests
-if [ "$(uname)" != "Darwin" ]; then
-  # Mac only with MPI
-  python -m pip uninstall horovod -y
-  bash /install/install_horovod.sh $HOROVOD_VERSION
+# only test and run horovod on GPU
+if [ "${TF_NEED_CUDA:-0}" -ne 0 ]; then
+  # Lack of HorovodJoin CPU kernels when install Horovod with NCCL
+  if [ "$(uname)" != "Darwin" ]; then
+    # Mac only with MPI
+    python -m pip uninstall horovod -y
+    bash /install/install_horovod.sh $HOROVOD_VERSION --only-cpu
+  fi
+  # TODO(jamesrong): Test on GPU.
+  CUDA_VISIBLE_DEVICES="" mpirun -np 2 -H localhost:2 --allow-run-as-root pytest -v ./tensorflow_recommenders_addons/dynamic_embedding/python/kernel_tests/horovod_sync_train_test.py
+  # Reinstall Horovod after tests
+  if [ "$(uname)" != "Darwin" ]; then
+    # Mac only with MPI
+    python -m pip uninstall horovod -y
+    bash /install/install_horovod.sh $HOROVOD_VERSION
+  fi
 fi
 
 IGNORE_HKV=""