From 04d0bf0a8b3a27d07bfa3278b8ce2ecdf3fa1590 Mon Sep 17 00:00:00 2001 From: Agus Date: Tue, 13 Aug 2024 20:31:32 +0200 Subject: [PATCH] Exclude `repo_id` from `LoadDataFromFileSystem` (#898) * Exclude repo_id from LoadDataFromFileSystem generator class and update tests * Update code to be compatible with python 3.9 --- .../steps/generators/huggingface.py | 14 ++++++++--- .../unit/steps/generators/test_huggingface.py | 23 ++++++++++++------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/src/distilabel/steps/generators/huggingface.py b/src/distilabel/steps/generators/huggingface.py index b31b9fbadc..b2add099b1 100644 --- a/src/distilabel/steps/generators/huggingface.py +++ b/src/distilabel/steps/generators/huggingface.py @@ -17,6 +17,7 @@ from pathlib import Path from typing import ( TYPE_CHECKING, + Annotated, Any, Dict, List, @@ -24,6 +25,7 @@ Optional, Sequence, Tuple, + TypeVar, Union, ) @@ -46,6 +48,13 @@ from distilabel.steps.typing import GeneratorStepOutput +T = TypeVar("T") + +# To avoid using repo_id in LoadDataFromFileSystem: +# https://github.com/pydantic/pydantic/discussions/7076#discussioncomment-6699138 +ExcludedField = Annotated[T, Field(exclude=True)] + + class LoadDataFromHub(GeneratorStep): """Loads a dataset from the Hugging Face Hub. @@ -334,6 +343,7 @@ class LoadDataFromFileSystem(LoadDataFromHub): default=None, description="The expected filetype. If not provided, it will be inferred from the file extension.", ) + repo_id: ExcludedField[Union[str, None]] = None def load(self) -> None: """Load the dataset from the file/s in disk.""" @@ -416,9 +426,7 @@ def outputs(self) -> List[str]: """ # We assume there are Dataset/IterableDataset, not it's ...Dict counterparts if self._dataset is None: - raise ValueError( - "Dataset not loaded yet, you must call `load` method first." - ) + self.load() return self._dataset.column_names diff --git a/tests/unit/steps/generators/test_huggingface.py b/tests/unit/steps/generators/test_huggingface.py index 280053c8cb..f1dd55a450 100644 --- a/tests/unit/steps/generators/test_huggingface.py +++ b/tests/unit/steps/generators/test_huggingface.py @@ -27,6 +27,8 @@ LoadDataFromHub, ) +from tests.unit.pipeline.utils import DummyStep1 + DISTILABEL_RUN_SLOW_TESTS = os.getenv("DISTILABEL_RUN_SLOW_TESTS", False) @@ -133,18 +135,23 @@ def test_read_from_jsonl_with_nested_folder( assert isinstance(generator_step_output[1], bool) assert len(generator_step_output[0]) == 22 - @pytest.mark.parametrize("load", [True, False]) - def test_outputs(self, load: bool) -> None: + def test_outputs(self) -> None: loader = LoadDataFromFileSystem( filetype="json", data_files=str(Path(__file__).parent / "sample_functions.jsonl"), ) - if load: - loader.load() - assert loader.outputs == ["type", "function"] - else: - with pytest.raises(ValueError): - loader.outputs # noqa: B018 + loader.load() + assert loader.outputs == ["type", "function"] + + def test_loading_in_pipeline(self): + with Pipeline(): + loader = LoadDataFromFileSystem( + filetype="json", + data_files=str(Path(__file__).parent / "sample_functions.jsonl"), + ) + dummy = DummyStep1(input_mappings={"instruction": "function"}) + loader >> dummy + assert loader.outputs == ["type", "function"] class TestLoadDataFromDisk: