Commit 23ce193

Add docstrings for the new steps

plaguss committed Jun 4, 2024
1 parent d582085 commit 23ce193
Showing 1 changed file with 67 additions and 15 deletions.
82 changes: 67 additions & 15 deletions src/distilabel/steps/generators/huggingface.py
@@ -65,6 +65,8 @@ class LoadFromHub(GeneratorStep):
`False`.
- `num_examples`: The number of examples to load from the dataset.
By default it will load all examples.
- `storage_options`: Key/value pairs to be passed on to the file-system backend, if any.
Defaults to `None`.
Output columns:
- dynamic (`all`): The columns that will be generated by this step, based on the
datasets loaded from the Hugging Face Hub.
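Below is a minimal sketch of how these runtime parameters can be supplied when the step runs inside a pipeline. It assumes `LoadFromHub` is re-exported from `distilabel.steps` (the class itself is defined in this file), and the repo id is hypothetical:

```python
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadFromHub  # assumed re-export of the class in this file

with Pipeline(name="load-from-hub-demo") as pipeline:
    loader = LoadFromHub(name="load_dataset")

if __name__ == "__main__":
    # Runtime parameters are supplied per step name when the pipeline runs,
    # instead of at instantiation time. The repo id below is hypothetical.
    distiset = pipeline.run(
        parameters={
            "load_dataset": {
                "repo_id": "my-org/my-dataset",
                "split": "train",
                "num_examples": 100,      # stop after the first 100 examples
                "storage_options": None,  # e.g. credentials for a remote filesystem
            }
        }
    )
```

Because they are runtime parameters, `split`, `num_examples` and `storage_options` could equally be passed at instantiation time.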
@@ -264,13 +266,40 @@ def _get_hf_dataset_info(


class LoadFromFileSystem(LoadFromHub):
"""Loads a dataset from a file in disk.
"""Loads a dataset from a file in your filesystem.
Take a look at [Hugging Face Datasets](https://huggingface.co/docs/datasets/loading)
`GeneratorStep` that creates a dataset from a file in the filesystem, uses Hugging Face `datasets`
library. Take a look at [Hugging Face Datasets](https://huggingface.co/docs/datasets/loading)
for more information of the supported file types.
Attributes:
data_files: The path to the file, or directory containing the files that make up
the dataset.
split: The split of the dataset to load (typically `train`, `test` or `validation`).
Runtime parameters:
- `batch_size`: The batch size to use when processing the data.
- `data_files`: The path to the file, or directory containing the files that make up
the dataset.
- `split`: The split of the dataset to load. Defaults to 'train'.
- `streaming`: Whether to load the dataset in streaming mode or not. Defaults to
`False`.
- `num_examples`: The number of examples to load from the dataset.
By default it will load all examples.
- `storage_options`: Key/value pairs to be passed on to the file-system backend, if any.
Defaults to `None`.
- `filetype`: The expected filetype. If not provided, it will be inferred from the file extension.
For more than one file, it will be inferred from the first file.
Output columns:
- dynamic (`all`): The columns that will be generated by this step, based on the
dataset loaded from the files.
Categories:
- load
"""

- data_files: Union[str, Path] = Field(
+ data_files: RuntimeParameter[Union[str, Path]] = Field(
default=None,
description="The data files, or directory containing the data files, to generate the dataset from.",
)
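To make the new step concrete, here is a hedged usage sketch. The import path (a re-export from `distilabel.steps`) and the CSV path are assumptions, and `filetype` is passed explicitly only for illustration, since the docstring above says it is inferred from the extension when omitted:

```python
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadFromFileSystem  # assumed re-export

with Pipeline(name="load-from-file-demo") as pipeline:
    loader = LoadFromFileSystem(
        name="load_csv",
        data_files="data/train.csv",  # hypothetical path; a directory of files also works
        split="train",
        filetype="csv",  # optional here, since it would be inferred from the extension
    )

if __name__ == "__main__":
    distiset = pipeline.run()
```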
Expand Down Expand Up @@ -352,8 +381,7 @@ def get_filetype(data_path: UPath) -> str:

@property
def outputs(self) -> List[str]:
"""
The columns that will be generated by this step, based on the datasets from a file
"""The columns that will be generated by this step, based on the datasets from a file
in disk.
Returns:
@@ -373,24 +401,50 @@ class LoadFromDisk(LoadFromHub):
If you previously saved your dataset using the `save_to_disk` method or
`Distiset.save_to_disk`, you can load it again to build a new pipeline with this class.
Attributes:
dataset_path: The path to the dataset or distiset.
split: The split of the dataset to load (typically `train`, `test` or `validation`).
config: The configuration of the dataset to load. This is optional and only needed
if the dataset has multiple configurations.
Runtime parameters:
- `batch_size`: The batch size to use when processing the data.
- `dataset_path`: The path to the dataset or distiset.
- `is_distiset`: Whether the dataset to load is a `Distiset` or not. Defaults to False.
- `split`: The split of the dataset to load. Defaults to 'train'.
- `config`: The configuration of the dataset to load. This is optional and only
needed if the dataset has multiple configurations.
- `num_examples`: The number of examples to load from the dataset.
By default it will load all examples.
- `storage_options`: Key/value pairs to be passed on to the file-system backend, if any.
Defaults to `None`.
Output columns:
- dynamic (`all`): The columns that will be generated by this step, based on the
dataset loaded from disk.
Categories:
- load
"""

- dataset_path: Union[str, Path] = Field(
+ dataset_path: RuntimeParameter[Union[str, Path]] = Field(
default=None,
description="_summary_",
)
- is_distiset: Optional[RuntimeParameter[bool]] = Field(
- default=False,
- description="_summary_",
- )
+ config: RuntimeParameter[str] = Field(
+ default=None,
+ description="The configuration of the dataset to load. This is optional and only"
+ " needed if the dataset has multiple configurations.",
+ )
+
+ is_distiset: Optional[RuntimeParameter[bool]] = Field(
+ default=False,
+ description="Whether the dataset to load is a `Distiset` or not. Defaults to False.",
+ )
keep_in_memory: Optional[RuntimeParameter[bool]] = Field(
- default=None, description="_summary_"
+ default=None,
+ description="Whether to copy the dataset in-memory, see `datasets.Dataset.load_from_disk` "
+ " for more information. Defaults to `None`.",
)
split: Optional[RuntimeParameter[str]] = Field(
default=None,
@@ -428,14 +482,12 @@ def load(self) -> None:

@property
def outputs(self) -> List[str]:
"""
The columns that will be generated by this step, based on the datasets from a file
"""The columns that will be generated by this step, based on the datasets from a file
in disk.
Returns:
The columns that will be generated by this step.
"""
- raise NotImplementedError("Method not implemented yet.")
# We assume these are Dataset/IterableDataset, not their ...Dict counterparts
if self._dataset is Ellipsis:
raise ValueError(
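Finally, a hedged sketch of reloading a previously saved `Distiset` with this step. The re-exported import and the path are assumptions; `config` is shown only because the docstring says it is needed for datasets with multiple configurations:

```python
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadFromDisk  # assumed re-export

with Pipeline(name="load-from-disk-demo") as pipeline:
    loader = LoadFromDisk(
        name="reload_distiset",
        dataset_path="path/to/saved/distiset",  # hypothetical output of `Distiset.save_to_disk`
        is_distiset=True,
        config="default",  # only needed if the distiset has multiple configurations
        split="train",
    )

if __name__ == "__main__":
    distiset = pipeline.run()
```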
