Skip to content

Commit

Permalink
Exclude repo_id from LoadDataFromFileSystem (#898)
Browse files Browse the repository at this point in the history
* Exclude repo_id from LoadDataFromFileSystem generator class and update tests

* Update code to be compatible with python 3.9
  • Loading branch information
plaguss authored Aug 13, 2024
1 parent 7ff4d20 commit 04d0bf0
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 11 deletions.
14 changes: 11 additions & 3 deletions src/distilabel/steps/generators/huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,15 @@
from pathlib import Path
from typing import (
TYPE_CHECKING,
Annotated,
Any,
Dict,
List,
Mapping,
Optional,
Sequence,
Tuple,
TypeVar,
Union,
)

Expand All @@ -46,6 +48,13 @@
from distilabel.steps.typing import GeneratorStepOutput


# Generic type parameter so that `ExcludedField[...]` can wrap any field type.
T = TypeVar("T")

# Reusable annotation that attaches `Field(exclude=True)` to the wrapped type.
# Introduced to avoid using repo_id in LoadDataFromFileSystem; see:
# https://github.com/pydantic/pydantic/discussions/7076#discussioncomment-6699138
# NOTE(review): presumably `exclude=True` drops the field from the serialized
# model output -- confirm against the pydantic `Field` documentation.
ExcludedField = Annotated[T, Field(exclude=True)]


class LoadDataFromHub(GeneratorStep):
"""Loads a dataset from the Hugging Face Hub.
Expand Down Expand Up @@ -334,6 +343,7 @@ class LoadDataFromFileSystem(LoadDataFromHub):
default=None,
description="The expected filetype. If not provided, it will be inferred from the file extension.",
)
repo_id: ExcludedField[Union[str, None]] = None

def load(self) -> None:
"""Load the dataset from the file/s in disk."""
Expand Down Expand Up @@ -416,9 +426,7 @@ def outputs(self) -> List[str]:
"""
# We assume these are Dataset/IterableDataset objects, not their ...Dict counterparts
if self._dataset is None:
raise ValueError(
"Dataset not loaded yet, you must call `load` method first."
)
self.load()

return self._dataset.column_names

Expand Down
23 changes: 15 additions & 8 deletions tests/unit/steps/generators/test_huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
LoadDataFromHub,
)

from tests.unit.pipeline.utils import DummyStep1

DISTILABEL_RUN_SLOW_TESTS = os.getenv("DISTILABEL_RUN_SLOW_TESTS", False)


Expand Down Expand Up @@ -133,18 +135,23 @@ def test_read_from_jsonl_with_nested_folder(
assert isinstance(generator_step_output[1], bool)
assert len(generator_step_output[0]) == 22

@pytest.mark.parametrize("load", [True, False])
def test_outputs(self, load: bool) -> None:
def test_outputs(self) -> None:
loader = LoadDataFromFileSystem(
filetype="json",
data_files=str(Path(__file__).parent / "sample_functions.jsonl"),
)
if load:
loader.load()
assert loader.outputs == ["type", "function"]
else:
with pytest.raises(ValueError):
loader.outputs # noqa: B018
loader.load()
assert loader.outputs == ["type", "function"]

# Checks that `outputs` can be read from a loader created inside a `Pipeline`
# and connected to a downstream step, with no explicit `loader.load()` call.
def test_loading_in_pipeline(self):
with Pipeline():
loader = LoadDataFromFileSystem(
filetype="json",
data_files=str(Path(__file__).parent / "sample_functions.jsonl"),
)
# NOTE(review): presumably maps the downstream step's "instruction" input to
# the loader's "function" column -- confirm against DummyStep1.
dummy = DummyStep1(input_mappings={"instruction": "function"})
loader >> dummy
assert loader.outputs == ["type", "function"]


class TestLoadDataFromDisk:
Expand Down

0 comments on commit 04d0bf0

Please sign in to comment.