Skip to content

Commit

Permalink
only test hvd on GPU
Browse files Browse the repository at this point in the history
  • Loading branch information
jq committed Aug 22, 2024
1 parent eaed4ec commit e3d8882
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 30 deletions.
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
import pytest
from tensorflow.python.framework import test_util

from tensorflow_recommenders_addons.dynamic_embedding.python.keras.callbacks import \
DEHvdBroadcastGlobalVariablesCallback


@pytest.fixture
def broadcast_callback(root_rank=0, device='/gpu:0'):
  """Build the Horovod broadcast callback shared by the tests in this module.

  Args:
    root_rank: Forwarded as the callback's `root_rank` argument.
    device: Forwarded as the callback's `device` argument.

  Returns:
    A `DEHvdBroadcastGlobalVariablesCallback` built from the two arguments.
  """
  callback_kwargs = {'root_rank': root_rank, 'device': device}
  return DEHvdBroadcastGlobalVariablesCallback(**callback_kwargs)


def test_on_batch_end_subsequent_calls(broadcast_callback):
  """Call `on_batch_end` on a callback already flagged as broadcast-done."""
  callback = broadcast_callback
  batch_index = 1
  # Flag the broadcast as already performed before the batch-end hook runs.
  callback.broadcast_done = True
  callback.on_batch_end(batch_index)
def test_on_batch_end_subsequent_calls():
  """Call `on_batch_end` on a callback already flagged as broadcast-done.

  The callback is created for device '/gpu:0', so the test is only meaningful
  on a machine with a GPU. Without one the test is reported as skipped rather
  than silently passing with nothing executed.
  """
  if not test_util.is_gpu_available():
    pytest.skip('Only test when gpu is available.')
  broadcast_callback = DEHvdBroadcastGlobalVariablesCallback(root_rank=0,
                                                             device='/gpu:0')
  # Flag the broadcast as already performed before the batch-end hook runs.
  broadcast_callback.broadcast_done = True
  broadcast_callback.on_batch_end(1)
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,10 @@
from tensorflow.python.framework import test_util
from tensorflow.python.framework.errors_impl import NotFoundError
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import variables
from tensorflow.python.platform import test
from tensorflow.python.training import adam
from tensorflow.python.training import checkpoint_management
from tensorflow.python.training import monitored_session
from tensorflow.python.training.optimizer import Optimizer as tf1_opt
from tensorflow.python.training import training_util
try:
from tensorflow.keras.optimizers.legacy import Adam
Expand Down Expand Up @@ -77,7 +75,8 @@ def test_adam_minimize_trainable(self):
self.skipTest(
"Apple silicon devices don't support synchronous training based on Horovod."
)

if not is_gpu_available:
self.skipTest('Only test when gpu is available.')
base_opt = adam.AdamOptimizer(1.0)
test_opt = adam.AdamOptimizer(1.0)
self.common_minimize_trainable_v1(base_opt, test_opt, name="adam")
Expand Down
4 changes: 0 additions & 4 deletions tools/docker/cpu_tests.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ FROM python:3.9 as build_wheel
ARG TF_VERSION="2.15.1"
ARG PY_VERSION="3.9"
ARG MPI_VERSION="4.1.1"
ARG HOROVOD_VERSION="0.28.1"

RUN pip install --upgrade pip
RUN pip install --default-timeout=1000 tensorflow==$TF_VERSION
Expand All @@ -16,9 +15,6 @@ RUN apt-get update && apt-get install -y sudo rsync cmake openmpi-bin libopenmpi
COPY tools/install_deps/install_bazelisk.sh /install/
RUN bash /install/install_bazelisk.sh

COPY tools/docker/install/install_horovod.sh /install/
RUN /install/install_horovod.sh $HOROVOD_VERSION --only-cpu

COPY requirements.txt ./
RUN pip install -r requirements.txt

Expand Down
29 changes: 16 additions & 13 deletions tools/testing/build_and_run_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,19 +49,22 @@ if ! [ -x "$(command -v nvidia-smi)" ]; then
EXTRA_ARGS="-n auto"
fi

# Horovod installed with NCCL lacks the HorovodJoin CPU kernels, so replace it
# with a CPU-only build before running the sync-training test.
if [ "$(uname)" != "Darwin" ]; then
# Skipped on macOS (Darwin) -- presumably Horovod there is MPI-only, so no
# reinstall is needed; confirm against install_horovod.sh.
python -m pip uninstall horovod -y
bash /install/install_horovod.sh $HOROVOD_VERSION --only-cpu
fi
# TODO(jamesrong): Test on GPU.
# Launch the test as 2 MPI processes on localhost with CUDA_VISIBLE_DEVICES
# set to empty (presumably so no GPU is visible to the test).
CUDA_VISIBLE_DEVICES="" mpirun -np 2 -H localhost:2 --allow-run-as-root pytest -v ./tensorflow_recommenders_addons/dynamic_embedding/python/kernel_tests/horovod_sync_train_test.py
# Reinstall Horovod without --only-cpu once the test has run.
if [ "$(uname)" != "Darwin" ]; then
# Skipped on macOS (Darwin), mirroring the pre-test swap above.
python -m pip uninstall horovod -y
bash /install/install_horovod.sh $HOROVOD_VERSION
fi
# Only install and test Horovod when building with CUDA.
# TF_NEED_CUDA defaults to 0 here so that an unset or empty value does not make
# the numeric test fail with "integer expression expected".
if [ "${TF_NEED_CUDA:-0}" -ne 0 ]; then
  # Horovod installed with NCCL lacks the HorovodJoin CPU kernels, so replace
  # it with a CPU-only build before running the sync-training test.
  if [ "$(uname)" != "Darwin" ]; then
    # Skipped on macOS (Darwin) -- presumably Horovod there is MPI-only, so no
    # reinstall is needed; confirm against install_horovod.sh.
    python -m pip uninstall horovod -y
    bash /install/install_horovod.sh $HOROVOD_VERSION --only-cpu
  fi
  # TODO(jamesrong): Test on GPU.
  # Launch the test as 2 MPI processes on localhost with CUDA_VISIBLE_DEVICES
  # set to empty (presumably so no GPU is visible to the test).
  CUDA_VISIBLE_DEVICES="" mpirun -np 2 -H localhost:2 --allow-run-as-root pytest -v ./tensorflow_recommenders_addons/dynamic_embedding/python/kernel_tests/horovod_sync_train_test.py
  # Reinstall Horovod without --only-cpu once the test has run.
  if [ "$(uname)" != "Darwin" ]; then
    # Skipped on macOS (Darwin), mirroring the pre-test swap above.
    python -m pip uninstall horovod -y
    bash /install/install_horovod.sh $HOROVOD_VERSION
  fi
fi

IGNORE_HKV=""
Expand Down

0 comments on commit e3d8882

Please sign in to comment.