Add HDF5 support for trajs and model_devis (#259)

## Summary by CodeRabbit

- **New Features**
  - Introduced new optional arguments for improved data handling and multitasking capabilities.
  - Added support for HDF5 formatted data in various modules.
  - Enhanced flexibility in input handling for multiple data formats.
- **Bug Fixes**
  - Improved robustness in handling validation data structures.
- **Documentation**
  - Updated documentation to clarify new parameters and their intended use.

---------

Signed-off-by: zjgemi <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
deepmodeling · Sep 10, 2024 · 3501db4 · 3501db4
1 parent ce4ab3e
commit 3501db4
Show file tree

Hide file tree

Showing 14 changed files with 116 additions and 39 deletions.
diff --git a/dpgen2/entrypoint/args.py b/dpgen2/entrypoint/args.py
@@ -359,6 +359,7 @@ def run_diffcsp_args():
     doc_gen_tasks = "Number of DiffCSP generation tasks"
     doc_gen_command = "Command for DiffCSP generation"
     doc_relax_group_size = "Group size for relaxation"
+    doc_use_hdf5 = "Use HDF5 to store trajs and model_devis"
     return [
         Argument(
             "gen_tasks",
@@ -380,6 +381,13 @@ def run_diffcsp_args():
             default=100,
             doc=doc_relax_group_size,
         ),
+        Argument(
+            "use_hdf5",
+            bool,
+            optional=True,
+            default=False,
+            doc=doc_use_hdf5,
+        ),
     ]
 
 

diff --git a/dpgen2/entrypoint/submit.py b/dpgen2/entrypoint/submit.py
@@ -111,6 +111,7 @@
     RunDPTrain,
     RunLmp,
     RunRelax,
+    RunRelaxHDF5,
     SelectConfs,
 )
 from dpgen2.op.caly_evo_step_merge import (
@@ -167,6 +168,7 @@ def make_concurrent_learning_op(
     upload_python_packages: Optional[List[os.PathLike]] = None,
     valid_data: Optional[S3Artifact] = None,
     train_optional_files: Optional[List[str]] = None,
+    explore_config: Optional[dict] = None,
 ):
     if train_style in ("dp", "dp-dist"):
         prep_run_train_op = PrepRunDPTrain(
@@ -234,7 +236,7 @@ def make_concurrent_learning_op(
             "prep-run-diffcsp",
             DiffCSPGen,
             PrepRelax,
-            RunRelax,
+            RunRelaxHDF5 if explore_config["use_hdf5"] else RunRelax,  # type: ignore
             prep_config=prep_explore_config,
             run_config=run_explore_config,
             upload_python_packages=upload_python_packages,
@@ -552,6 +554,7 @@ def workflow_concurrent_learning(
         upload_python_packages=upload_python_packages,
         valid_data=valid_data,
         train_optional_files=train_optional_files,
+        explore_config=explore_config,
     )
     scheduler = make_naive_exploration_scheduler(config)
 

diff --git a/dpgen2/exploration/render/traj_render.py b/dpgen2/exploration/render/traj_render.py
@@ -15,6 +15,9 @@
 
 import dpdata
 import numpy as np
+from dflow.python.opio import (
+    HDF5Dataset,
+)
 
 from ..deviation import (
     DeviManager,
@@ -30,7 +33,7 @@ class TrajRender(ABC):
     @abstractmethod
     def get_model_devi(
         self,
-        files: List[Path],
+        files: Union[List[Path], List[HDF5Dataset]],
     ) -> DeviManager:
         r"""Get model deviations from recording files.
 
@@ -48,7 +51,7 @@ def get_model_devi(
     @abstractmethod
     def get_confs(
         self,
-        traj: List[Path],
+        traj: Union[List[Path], List[HDF5Dataset]],
         id_selected: List[List[int]],
         type_map: Optional[List[str]] = None,
         conf_filters: Optional["ConfFilters"] = None,

diff --git a/dpgen2/exploration/render/traj_render_lammps.py b/dpgen2/exploration/render/traj_render_lammps.py
@@ -1,4 +1,7 @@
 import json
+from io import (
+    StringIO,
+)
 from pathlib import (
     Path,
 )
@@ -12,6 +15,9 @@
 
 import dpdata
 import numpy as np
+from dflow.python.opio import (
+    HDF5Dataset,
+)
 
 from dpgen2.utils import (
     setup_ele_temp,
@@ -42,7 +48,7 @@ def __init__(
 
     def get_model_devi(
         self,
-        files: List[Path],
+        files: Union[List[Path], List[HDF5Dataset]],
     ) -> DeviManager:
         ntraj = len(files)
 
@@ -53,7 +59,10 @@ def get_model_devi(
         return model_devi
 
     def _load_one_model_devi(self, fname, model_devi):
-        dd = np.loadtxt(fname)
+        if isinstance(fname, HDF5Dataset):
+            dd = fname.get_data()
+        else:
+            dd = np.loadtxt(fname)
         if len(np.shape(dd)) == 1:  # In case model-devi.out is 1-dimensional
             dd = dd.reshape((1, len(dd)))
 
@@ -92,7 +101,7 @@ def set_ele_temp(self, system, ele_temp):
 
     def get_confs(
         self,
-        trajs: List[Path],
+        trajs: Union[List[Path], List[HDF5Dataset]],
         id_selected: List[List[int]],
         type_map: Optional[List[str]] = None,
         conf_filters: Optional["ConfFilters"] = None,
@@ -108,7 +117,11 @@ def get_confs(
         ms = dpdata.MultiSystems(type_map=type_map)
         for ii in range(ntraj):
             if len(id_selected[ii]) > 0:
-                ss = dpdata.System(trajs[ii], fmt=traj_fmt, type_map=type_map)
+                if isinstance(trajs[ii], HDF5Dataset):
+                    traj = StringIO(trajs[ii].get_data())  # type: ignore
+                else:
+                    traj = trajs[ii]
+                ss = dpdata.System(traj, fmt=traj_fmt, type_map=type_map)
                 ss.nopbc = self.nopbc
                 if ele_temp:
                     self.set_ele_temp(ss, ele_temp[ii])

diff --git a/dpgen2/exploration/scheduler/convergence_check_stage_scheduler.py b/dpgen2/exploration/scheduler/convergence_check_stage_scheduler.py
@@ -5,11 +5,15 @@
     List,
     Optional,
     Tuple,
+    Union,
 )
 
 from dflow.python import (
     FatalError,
 )
+from dflow.python.opio import (
+    HDF5Dataset,
+)
 
 from dpgen2.exploration.report import (
     ExplorationReport,
@@ -67,7 +71,7 @@ def reached_max_iteration(self):
     def plan_next_iteration(
         self,
         report: Optional[ExplorationReport] = None,
-        trajs: Optional[List[Path]] = None,
+        trajs: Optional[Union[List[Path], List[HDF5Dataset]]] = None,
     ) -> Tuple[bool, Optional[BaseExplorationTaskGroup], Optional[ConfSelector]]:
         if self.complete():
             raise FatalError("Cannot plan because the stage has completed.")

diff --git a/dpgen2/exploration/scheduler/scheduler.py b/dpgen2/exploration/scheduler/scheduler.py
@@ -5,12 +5,16 @@
     List,
     Optional,
     Tuple,
+    Union,
 )
 
 import numpy as np
 from dflow.python import (
     FatalError,
 )
+from dflow.python.opio import (
+    HDF5Dataset,
+)
 
 from dpgen2.exploration.report import (
     ExplorationReport,
@@ -110,7 +114,7 @@ def force_stage_complete(self):
     def plan_next_iteration(
         self,
         report: Optional[ExplorationReport] = None,
-        trajs: Optional[List[Path]] = None,
+        trajs: Optional[Union[List[Path], List[HDF5Dataset]]] = None,
     ) -> Tuple[bool, Optional[ExplorationTaskGroup], Optional[ConfSelector]]:
         """
         Make the plan for the next DPGEN iteration.
@@ -119,7 +123,7 @@ def plan_next_iteration(
         ----------
         report : ExplorationReport
             The exploration report of this iteration.
-        trajs : List[Path]
+        trajs : Union[List[Path], List[HDF5Dataset]]
             A list of configurations generated during the exploration. May be used to generate new configurations for the next iteration.
 
         Returns

diff --git a/dpgen2/exploration/scheduler/stage_scheduler.py b/dpgen2/exploration/scheduler/stage_scheduler.py
@@ -8,6 +8,11 @@
 from typing import (
     List,
     Tuple,
+    Union,
+)
+
+from dflow.python.opio import (
+    HDF5Dataset,
 )
 
 from dpgen2.exploration.report import (
@@ -87,7 +92,7 @@ def get_reports(self) -> List[ExplorationReport]:
     def plan_next_iteration(
         self,
         report: ExplorationReport,
-        trajs: List[Path],
+        trajs: Union[List[Path], List[HDF5Dataset]],
     ) -> Tuple[bool, ExplorationTaskGroup, ConfSelector]:
         """
         Make the plan for the next iteration of the stage.
@@ -96,11 +101,9 @@ def plan_next_iteration(
 
         Parameters
         ----------
-        hist_reports : List[ExplorationReport]
-            The historical exploration report of the stage. If this is the first iteration of the stage, this list is empty.
         report : ExplorationReport
             The exploration report of this iteration.
-        confs : List[Path]
+        trajs : Union[List[Path], List[HDF5Dataset]]
             A list of configurations generated during the exploration. May be used to generate new configurations for the next iteration.
 
         Returns

diff --git a/dpgen2/exploration/selector/conf_selector.py b/dpgen2/exploration/selector/conf_selector.py
@@ -10,9 +10,13 @@
     Optional,
     Set,
     Tuple,
+    Union,
 )
 
 import dpdata
+from dflow.python.opio import (
+    HDF5Dataset,
+)
 
 from dpgen2.exploration.report import (
     ExplorationReport,
@@ -29,8 +33,8 @@ class ConfSelector(ABC):
     @abstractmethod
     def select(
         self,
-        trajs: List[Path],
-        model_devis: List[Path],
+        trajs: Union[List[Path], List[HDF5Dataset]],
+        model_devis: Union[List[Path], List[HDF5Dataset]],
         type_map: Optional[List[str]] = None,
         optional_outputs: Optional[List[Path]] = None,
     ) -> Tuple[List[Path], ExplorationReport]:

diff --git a/dpgen2/exploration/selector/conf_selector_frame.py b/dpgen2/exploration/selector/conf_selector_frame.py
@@ -9,10 +9,14 @@
     List,
     Optional,
     Tuple,
+    Union,
 )
 
 import dpdata
 import numpy as np
+from dflow.python.opio import (
+    HDF5Dataset,
+)
 
 from dpgen2.exploration.render import (
     TrajRender,
@@ -52,8 +56,8 @@ def __init__(
 
     def select(
         self,
-        trajs: List[Path],
-        model_devis: List[Path],
+        trajs: Union[List[Path], List[HDF5Dataset]],
+        model_devis: Union[List[Path], List[HDF5Dataset]],
         type_map: Optional[List[str]] = None,
         optional_outputs: Optional[List[Path]] = None,
     ) -> Tuple[List[Path], ExplorationReport]:

diff --git a/dpgen2/flow/dpgen_loop.py b/dpgen2/flow/dpgen_loop.py
@@ -9,6 +9,7 @@
 from typing import (
     List,
     Optional,
+    Union,
 )
 
 import jsonpickle
@@ -35,6 +36,7 @@
     OPIO,
     Artifact,
     BigParameter,
+    HDF5Datasets,
     OPIOSign,
     PythonOPTemplate,
     Slices,
@@ -91,7 +93,7 @@ def get_input_sign(cls):
             {
                 "exploration_scheduler": BigParameter(ExplorationScheduler),
                 "exploration_report": BigParameter(ExplorationReport),
-                "trajs": Artifact(List[Path]),
+                "trajs": Artifact(Union[List[Path], HDF5Datasets]),
             }
         )
 

diff --git a/dpgen2/op/__init__.py b/dpgen2/op/__init__.py
@@ -39,6 +39,7 @@
 )
 from .run_relax import (
     RunRelax,
+    RunRelaxHDF5,
 )
 from .select_confs import (
     SelectConfs,