Skip to content

Commit

Permalink
fix: normalize path (#283)
Browse files: browse the repository at this point in the history
  • Loading branch information
svittoz authored Apr 5, 2024
1 parent 165bb2c commit ad16e9b
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 11 deletions.
5 changes: 5 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Changelog

## Unreleased

### Fixed
- Fix file system detection in `edsnlp.utils.file_system.normalize_fs_path`, which was not working correctly

## v0.11.1 (2024-04-02)

### Added
Expand Down
17 changes: 7 additions & 10 deletions edsnlp/utils/file_system.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,24 +42,21 @@ def normalize_fs_path(
filesystem: Optional[FileSystem],
path: Union[str, Path],
) -> Tuple[AbstractFileSystem, str]:
path = str(path)
has_protocol = isinstance(path, str) and "://" in path

if filesystem is None or (isinstance(path, str) and "://" in path):
path = (
os.path.abspath(path)
if isinstance(path, Path) or "://" in path
else f"file://{os.path.abspath(path)}"
)
inferred_fs, fs_path = pyarrow.fs.FileSystem.from_uri(path)
# We need to detect the fs from the path
if filesystem is None or has_protocol:
uri: str = path if has_protocol else f"file://{os.path.abspath(path)}"
inferred_fs, fs_path = pyarrow.fs.FileSystem.from_uri(uri)
filesystem = filesystem or inferred_fs
assert inferred_fs.type_name == filesystem.type_name, (
f"Protocol {inferred_fs.type_name} in path does not match "
f"filesystem {filesystem.type_name}"
)
path = fs_path
path = fs_path # path without protocol

return (
ArrowFSWrapper(filesystem)
if isinstance(filesystem, pyarrow.fs.FileSystem)
else filesystem
), path
), str(path)
3 changes: 2 additions & 1 deletion tests/data/test_parquet.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
from pathlib import Path

import pyarrow.dataset
Expand Down Expand Up @@ -242,7 +243,7 @@ def test_read_to_parquet(blank_nlp, tmpdir):
fs = pyarrow.fs.LocalFileSystem()
doc = list(
edsnlp.data.read_parquet(
input_dir,
input_dir.relative_to(os.getcwd()),
converter="omop",
span_attributes=["etat", "assertion"],
doc_attributes=["context_var"],
Expand Down

0 comments on commit ad16e9b

Please sign in to comment.