You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
I am unable to restore the data iterator checkpoint.
The stack trace is below:
Traceback (most recent call last):
File "/home/quan/code/monopi/monopi/model/scripts/train.py", line 660, in <module>
register_cfg.cli_with_selectable_config(main)
File "/home/quan/code/monopi/monopi/model/configs/register_config.py", line 3509, in cli_with_selectable_config
return tyro.cli(f, args=sys.argv[3:])
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/quan/micromamba/envs/monopi/lib/python3.11/site-packages/tyro/_cli.py", line 217, in cli
return run_with_args_from_cli()
^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/quan/code/monopi/monopi/model/scripts/train.py", line 339, in main
train_state, train_iter, val_iter, mixture, dataset_specs, special_processors = init_train_state_and_dataloader(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/quan/code/monopi/monopi/model/scripts/train.py", line 203, in init_train_state_and_dataloader
train_iter, val_iter, mixture, specs, special_processors = init_dataloader(
^^^^^^^^^^^^^^^^
File "/home/quan/code/monopi/monopi/model/scripts/train.py", line 155, in init_dataloader
train_iter, val_iter, mixture, specs, special_processors = dali_iterator.create_dali_iterators(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/quan/code/monopi/monopi/model/data/dali_iterator.py", line 383, in create_dali_iterators
train_dali_iterator = train_data_iterator_fn(
^^^^^^^^^^^^^^^^^^^^^^^
File "/home/quan/micromamba/envs/monopi/lib/python3.11/site-packages/nvidia/dali/plugin/jax/iterator.py", line 346, in create_iterator
return iterator_type(
^^^^^^^^^^^^^^
File "/home/quan/micromamba/envs/monopi/lib/python3.11/site-packages/nvidia/dali/plugin/jax/iterator.py", line 137, in __init__
_DaliBaseIterator.__init__(
File "/home/quan/micromamba/envs/monopi/lib/python3.11/site-packages/nvidia/dali/plugin/base_iterator.py", line 215, in __init__
p.build()
File "/home/quan/micromamba/envs/monopi/lib/python3.11/site-packages/nvidia/dali/pipeline.py", line 1037, in build
self._restore_state_from_checkpoint()
File "/home/quan/micromamba/envs/monopi/lib/python3.11/site-packages/nvidia/dali/pipeline.py", line 1000, in _restore_state_from_checkpoint
external_ctx_cpt = self._pipe.RestoreFromSerializedCheckpoint(self._checkpoint)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: Assert on "op" failed: The executor doesn't recognize "ExternalSource" as a name of an operator.
The checkpoint might come from another pipeline.
Traceback (most recent call last):
File "/home/quan/code/monopi/monopi/model/scripts/train.py", line 660, in <module>
register_cfg.cli_with_selectable_config(main)
File "/home/quan/code/monopi/monopi/model/configs/register_config.py", line 3509, in cli_with_selectable_config
return tyro.cli(f, args=sys.argv[3:])
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/quan/micromamba/envs/monopi/lib/python3.11/site-packages/tyro/_cli.py", line 217, in cli
return run_with_args_from_cli()
^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/quan/code/monopi/monopi/model/scripts/train.py", line 339, in main
train_state, train_iter, val_iter, mixture, dataset_specs, special_processors = init_train_state_and_dataloader(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/quan/code/monopi/monopi/model/scripts/train.py", line 203, in init_train_state_and_dataloader
train_iter, val_iter, mixture, specs, special_processors = init_dataloader(
^^^^^^^^^^^^^^^^
File "/home/quan/code/monopi/monopi/model/scripts/train.py", line 155, in init_dataloader
train_iter, val_iter, mixture, specs, special_processors = dali_iterator.create_dali_iterators(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/quan/code/monopi/monopi/model/data/dali_iterator.py", line 383, in create_dali_iterators
train_dali_iterator = train_data_iterator_fn(
^^^^^^^^^^^^^^^^^^^^^^^
File "/home/quan/micromamba/envs/monopi/lib/python3.11/site-packages/nvidia/dali/plugin/jax/iterator.py", line 346, in create_iterator
return iterator_type(
^^^^^^^^^^^^^^
File "/home/quan/micromamba/envs/monopi/lib/python3.11/site-packages/nvidia/dali/plugin/jax/iterator.py", line 137, in __init__
_DaliBaseIterator.__init__(
File "/home/quan/micromamba/envs/monopi/lib/python3.11/site-packages/nvidia/dali/plugin/base_iterator.py", line 215, in __init__
p.build()
File "/home/quan/micromamba/envs/monopi/lib/python3.11/site-packages/nvidia/dali/pipeline.py", line 1037, in build
self._restore_state_from_checkpoint()
File "/home/quan/micromamba/envs/monopi/lib/python3.11/site-packages/nvidia/dali/pipeline.py", line 1000, in _restore_state_from_checkpoint
external_ctx_cpt = self._pipe.RestoreFromSerializedCheckpoint(self._checkpoint)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: Assert on "op" failed: The executor doesn't recognize "ExternalSource" as a name of an operator.
Minimum reproducible example
No response
Relevant log output
No response
Other/Misc.
No response
Check for duplicates
I have searched the open bugs/issues and have found no duplicates for this bug report
The text was updated successfully, but these errors were encountered:
Hello @quanvuong, thank you for reporting the problem. Can you please share the code that causes this error? This would allow us to reproduce it and investigate.
Version
1.40.0
Describe the bug.
I am unable to restore the data iterator checkpoint.
The stack trace is below.
Minimum reproducible example
No response
Relevant log output
No response
Other/Misc.
No response
Check for duplicates
The text was updated successfully, but these errors were encountered: