From ad16e9bc75620ecb326921b54522f858354f8431 Mon Sep 17 00:00:00 2001 From: svittoz <137794505+svittoz@users.noreply.github.com> Date: Fri, 5 Apr 2024 10:19:08 +0200 Subject: [PATCH] fix: normalize path (#283) --- changelog.md | 5 +++++ edsnlp/utils/file_system.py | 17 +++++++---------- tests/data/test_parquet.py | 3 ++- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/changelog.md b/changelog.md index 3f6e163c4..201add09b 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,10 @@ # Changelog +## Unreleased + +### Fixed +- Fix `edsnlp.utils.file_system.normalize_fs_path` file system detection not working correctly + ## v0.11.1 (2024-04-02) ### Added diff --git a/edsnlp/utils/file_system.py b/edsnlp/utils/file_system.py index 59f63f1db..1aaee106b 100644 --- a/edsnlp/utils/file_system.py +++ b/edsnlp/utils/file_system.py @@ -42,24 +42,21 @@ def normalize_fs_path( filesystem: Optional[FileSystem], path: Union[str, Path], ) -> Tuple[AbstractFileSystem, str]: - path = str(path) + has_protocol = isinstance(path, str) and "://" in path - if filesystem is None or (isinstance(path, str) and "://" in path): - path = ( - os.path.abspath(path) - if isinstance(path, Path) or "://" in path - else f"file://{os.path.abspath(path)}" - ) - inferred_fs, fs_path = pyarrow.fs.FileSystem.from_uri(path) + # We need to detect the fs from the path + if filesystem is None or has_protocol: + uri: str = path if has_protocol else f"file://{os.path.abspath(path)}" + inferred_fs, fs_path = pyarrow.fs.FileSystem.from_uri(uri) filesystem = filesystem or inferred_fs assert inferred_fs.type_name == filesystem.type_name, ( f"Protocol {inferred_fs.type_name} in path does not match " f"filesystem {filesystem.type_name}" ) - path = fs_path + path = fs_path # path without protocol return ( ArrowFSWrapper(filesystem) if isinstance(filesystem, pyarrow.fs.FileSystem) else filesystem - ), path + ), str(path) diff --git a/tests/data/test_parquet.py b/tests/data/test_parquet.py index 7d3217e0d..9c6c6f47c 100644 --- a/tests/data/test_parquet.py +++ b/tests/data/test_parquet.py @@ -1,3 +1,4 @@ +import os from pathlib import Path import pyarrow.dataset @@ -242,7 +243,7 @@ def test_read_to_parquet(blank_nlp, tmpdir): fs = pyarrow.fs.LocalFileSystem() doc = list( edsnlp.data.read_parquet( - input_dir, + input_dir.relative_to(os.getcwd()), converter="omop", span_attributes=["etat", "assertion"], doc_attributes=["context_var"],