From d2f750665238663cafe6bb02e1cdaacbeb98b89a Mon Sep 17 00:00:00 2001 From: Julian Qian Date: Tue, 3 Sep 2024 13:33:00 -0700 Subject: [PATCH] tensorboard --- .../movielens-1m-keras-with-horovod.py | 15 +++++++++++++-- .../{seq_and_dense.py.py => seq_and_dense.py} | 1 + demo/dynamic_embedding/seq_and_dense/start.sh | 1 + 3 files changed, 15 insertions(+), 2 deletions(-) rename demo/dynamic_embedding/seq_and_dense/{seq_and_dense.py.py => seq_and_dense.py} (99%) diff --git a/demo/dynamic_embedding/movielens-1m-keras-with-horovod/movielens-1m-keras-with-horovod.py b/demo/dynamic_embedding/movielens-1m-keras-with-horovod/movielens-1m-keras-with-horovod.py index a52b585c..44a2c11b 100644 --- a/demo/dynamic_embedding/movielens-1m-keras-with-horovod/movielens-1m-keras-with-horovod.py +++ b/demo/dynamic_embedding/movielens-1m-keras-with-horovod/movielens-1m-keras-with-horovod.py @@ -67,6 +67,7 @@ def get_rank() -> int: 'Embedding size for users and movies') flags.DEFINE_integer('test_steps', 128, 'test steps.') flags.DEFINE_integer('test_batch', 1024, 'test batch size.') +flags.DEFINE_integer('profiles', 10, 'number of profiles') flags.DEFINE_bool('shuffle', True, 'shuffle dataset.') FLAGS = flags.FLAGS @@ -638,7 +639,6 @@ def train(): if os.path.exists(FLAGS.model_dir + '/variables'): model.load_weights(FLAGS.model_dir) - tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=FLAGS.model_dir) save_options = tf.saved_model.SaveOptions(namespace_whitelist=['TFRA']) ckpt_callback = de.keras.callbacks.ModelCheckpoint( filepath=FLAGS.model_dir + '/weights_epoch{epoch:03d}_loss{loss:.4f}', @@ -651,9 +651,20 @@ def train(): else: callbacks_list = [ckpt_callback] + def get_profiling_batch(total_batches: int, num_profiles: int): + interval = total_batches // num_profiles + if interval == 0: + return None + else: + return ((i + 1) * interval for i in range(num_profiles)) + # The log class callback only takes effect in rank0 for convenience if get_rank() == 0: - 
callbacks_list.extend([tensorboard_callback]) + profile_batch = list(get_profiling_batch(FLAGS.steps_per_epoch, FLAGS.profiles) or ()) + if profile_batch: + tensorboard_callback = tf.keras.callbacks.TensorBoard( + log_dir="logs/profile", update_freq=100, profile_batch=(profile_batch[0], profile_batch[-1])) + callbacks_list.extend([tensorboard_callback]) # If there are callbacks such as evaluation metrics that call model calculations, take effect on all ranks. # callbacks_list.extend([my_auc_callback]) diff --git a/demo/dynamic_embedding/seq_and_dense/seq_and_dense.py.py b/demo/dynamic_embedding/seq_and_dense/seq_and_dense.py similarity index 99% rename from demo/dynamic_embedding/seq_and_dense/seq_and_dense.py.py rename to demo/dynamic_embedding/seq_and_dense/seq_and_dense.py index 91de1231..e4e84a5c 100644 --- a/demo/dynamic_embedding/seq_and_dense/seq_and_dense.py.py +++ b/demo/dynamic_embedding/seq_and_dense/seq_and_dense.py @@ -22,6 +22,7 @@ # optimal performance os.environ['TF_XLA_FLAGS'] = '--tf_xla_auto_jit=2 --tf_xla_cpu_global_jit' +tf.config.optimizer.set_jit(True) def has_horovod() -> bool: return 'OMPI_COMM_WORLD_RANK' in os.environ or 'PMI_RANK' in os.environ diff --git a/demo/dynamic_embedding/seq_and_dense/start.sh b/demo/dynamic_embedding/seq_and_dense/start.sh index 37b94abd..4cb89511 100644 --- a/demo/dynamic_embedding/seq_and_dense/start.sh +++ b/demo/dynamic_embedding/seq_and_dense/start.sh @@ -2,5 +2,6 @@ rm -rf ./export_dir gpu_num=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) export gpu_num +export TF_XLA_FLAGS=--tf_xla_auto_jit=2 horovodrun -np $gpu_num python seq_and_dense.py --mode="train" --model_dir="./model_dir" --export_dir="./export_dir" \ --steps_per_epoch=${1:-20000} --shuffle=${2:-True}