Commit 23ce193

Add docstrings for the new steps

plaguss committed Jun 4, 2024
1 parent d582085 commit 23ce193
Showing 1 changed file with 67 additions and 15 deletions.
82 changes: 67 additions & 15 deletions src/distilabel/steps/generators/huggingface.py
@@ -65,6 +65,8 @@ class LoadFromHub(GeneratorStep):
`False`.
- `num_examples`: The number of examples to load from the dataset.
By default it will load all examples.
- `storage_options`: Key/value pairs to be passed on to the file-system backend, if any.
Defaults to `None`.
Output columns:
- dynamic (`all`): The columns that will be generated by this step, based on the
datasets loaded from the Hugging Face Hub.
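Below is a minimal sketch of how these runtime parameters can be supplied when the step runs inside a pipeline. It assumes `LoadFromHub` is re-exported from `distilabel.steps` (the class itself is defined in this file), and the repo id is hypothetical:

```python
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadFromHub  # assumed re-export of the class in this file

with Pipeline(name="load-from-hub-demo") as pipeline:
    loader = LoadFromHub(name="load_dataset")

if __name__ == "__main__":
    # Runtime parameters are supplied per step name when the pipeline runs,
    # instead of at instantiation time. The repo id below is hypothetical.
    distiset = pipeline.run(
        parameters={
            "load_dataset": {
                "repo_id": "my-org/my-dataset",
                "split": "train",
                "num_examples": 100,      # stop after the first 100 examples
                "storage_options": None,  # e.g. credentials for a remote filesystem
            }
        }
    )
```

Because they are runtime parameters, `split`, `num_examples` and `storage_options` could equally be passed at instantiation time.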
@@ -264,13 +266,40 @@ def _get_hf_dataset_info(


class LoadFromFileSystem(LoadFromHub):
"""Loads a dataset from a file in disk.
"""Loads a dataset from a file in your filesystem.
Take a look at [Hugging Face Datasets](https://huggingface.co/docs/datasets/loading)
`GeneratorStep` that creates a dataset from a file in the filesystem, uses Hugging Face `datasets`
library. Take a look at [Hugging Face Datasets](https://huggingface.co/docs/datasets/loading)
for more information of the supported file types.
Attributes:
data_files: The path to the file, or directory containing the files that make up
the dataset.
split: The split of the dataset to load (typically `train`, `test` or `validation`).
Runtime parameters:
- `batch_size`: The batch size to use when processing the data.
- `data_files`: The path to the file, or directory containing the files that make up
the dataset.
- `split`: The split of the dataset to load. Defaults to 'train'.
- `streaming`: Whether to load the dataset in streaming mode or not. Defaults to
`False`.
- `num_examples`: The number of examples to load from the dataset.
By default it will load all examples.
- `storage_options`: Key/value pairs to be passed on to the file-system backend, if any.
Defaults to `None`.
- `filetype`: The expected filetype. If not provided, it will be inferred from the file extension.
For more than one file, it will be inferred from the first file.
Output columns:
- dynamic (`all`): The columns that will be generated by this step, based on the
dataset loaded from the files.
Categories:
- load
"""

- data_files: Union[str, Path] = Field(
+ data_files: RuntimeParameter[Union[str, Path]] = Field(
default=None,
description="The data files, or directory containing the data files, to generate the dataset from.",
)
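To make the new step concrete, here is a hedged usage sketch. The import path (a re-export from `distilabel.steps`) and the CSV path are assumptions, and `filetype` is passed explicitly only for illustration, since the docstring above says it is inferred from the extension when omitted:

```python
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadFromFileSystem  # assumed re-export

with Pipeline(name="load-from-file-demo") as pipeline:
    loader = LoadFromFileSystem(
        name="load_csv",
        data_files="data/train.csv",  # hypothetical path; a directory of files also works
        split="train",
        filetype="csv",  # optional here, since it would be inferred from the extension
    )

if __name__ == "__main__":
    distiset = pipeline.run()
```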
Expand Down Expand Up @@ -352,8 +381,7 @@ def get_filetype(data_path: UPath) -> str:

@property
def outputs(self) -> List[str]:
"""
The columns that will be generated by this step, based on the datasets from a file
"""The columns that will be generated by this step, based on the datasets from a file
in disk.
Returns:
@@ -373,24 +401,50 @@ class LoadFromDisk(LoadFromHub):
If you previously saved your dataset using the `save_to_disk` method or
`Distiset.save_to_disk`, you can load it again to build a new pipeline with this class.
Attributes:
dataset_path: The path to the dataset or distiset.
split: The split of the dataset to load (typically `train`, `test` or `validation`).
config: The configuration of the dataset to load. This is optional and only needed
if the dataset has multiple configurations.
Runtime parameters:
- `batch_size`: The batch size to use when processing the data.
- `dataset_path`: The path to the dataset or distiset.
- `is_distiset`: Whether the dataset to load is a `Distiset` or not. Defaults to False.
- `split`: The split of the dataset to load. Defaults to 'train'.
- `config`: The configuration of the dataset to load. This is optional and only
needed if the dataset has multiple configurations.
- `num_examples`: The number of examples to load from the dataset.
By default it will load all examples.
- `storage_options`: Key/value pairs to be passed on to the file-system backend, if any.
Defaults to `None`.
Output columns:
- dynamic (`all`): The columns that will be generated by this step, based on the
dataset loaded from disk.
Categories:
- load
"""

- dataset_path: Union[str, Path] = Field(
+ dataset_path: RuntimeParameter[Union[str, Path]] = Field(
default=None,
description="_summary_",
)
- is_distiset: Optional[RuntimeParameter[bool]] = Field(
- default=False,
- description="_summary_",
- )
+ config: RuntimeParameter[str] = Field(
+ default=None,
+ description="The configuration of the dataset to load. This is optional and only"
+ " needed if the dataset has multiple configurations.",
+ )
+
+ is_distiset: Optional[RuntimeParameter[bool]] = Field(
+ default=False,
+ description="Whether the dataset to load is a `Distiset` or not. Defaults to False.",
+ )
keep_in_memory: Optional[RuntimeParameter[bool]] = Field(
- default=None, description="_summary_"
+ default=None,
+ description="Whether to copy the dataset in-memory, see `datasets.Dataset.load_from_disk` "
+ " for more information. Defaults to `None`.",
)
split: Optional[RuntimeParameter[str]] = Field(
default=None,
@@ -428,14 +482,12 @@ def load(self) -> None:

@property
def outputs(self) -> List[str]:
"""
The columns that will be generated by this step, based on the datasets from a file
"""The columns that will be generated by this step, based on the datasets from a file
in disk.
Returns:
The columns that will be generated by this step.
"""
- raise NotImplementedError("Method not implemented yet.")
# We assume these are Dataset/IterableDataset, not their ...Dict counterparts
if self._dataset is Ellipsis:
raise ValueError(
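Finally, a hedged sketch of reloading a previously saved `Distiset` with this step. The re-exported import and the path are assumptions; `config` is shown only because the docstring says it is needed for datasets with multiple configurations:

```python
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadFromDisk  # assumed re-export

with Pipeline(name="load-from-disk-demo") as pipeline:
    loader = LoadFromDisk(
        name="reload_distiset",
        dataset_path="path/to/saved/distiset",  # hypothetical output of `Distiset.save_to_disk`
        is_distiset=True,
        config="default",  # only needed if the distiset has multiple configurations
        split="train",
    )

if __name__ == "__main__":
    distiset = pipeline.run()
```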
