Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix/normalize storage paths #2384

Merged
merged 11 commits into from
Oct 16, 2024
33 changes: 9 additions & 24 deletions src/zarr/api/asynchronous.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,10 +170,7 @@ async def consolidate_metadata(
The group, with the ``consolidated_metadata`` field set to include
the metadata of each child node.
"""
store_path = await make_store_path(store)

if path is not None:
store_path = store_path / path
store_path = await make_store_path(store, path=path)

group = await AsyncGroup.open(store_path, zarr_format=zarr_format, use_consolidated=False)
group.store_path.store._check_writable()
Expand Down Expand Up @@ -291,10 +288,7 @@ async def open(
"""
zarr_format = _handle_zarr_version_or_format(zarr_version=zarr_version, zarr_format=zarr_format)

store_path = await make_store_path(store, mode=mode, storage_options=storage_options)

if path is not None:
store_path = store_path / path
store_path = await make_store_path(store, mode=mode, path=path, storage_options=storage_options)

if "shape" not in kwargs and mode in {"a", "w", "w-"}:
try:
Expand Down Expand Up @@ -401,9 +395,7 @@ async def save_array(
)

mode = kwargs.pop("mode", None)
store_path = await make_store_path(store, mode=mode, storage_options=storage_options)
if path is not None:
store_path = store_path / path
store_path = await make_store_path(store, path=path, mode=mode, storage_options=storage_options)
new = await AsyncArray.create(
store_path,
zarr_format=zarr_format,
Expand Down Expand Up @@ -582,9 +574,7 @@ async def group(

mode = None if isinstance(store, Store) else cast(AccessModeLiteral, "a")

store_path = await make_store_path(store, mode=mode, storage_options=storage_options)
if path is not None:
store_path = store_path / path
store_path = await make_store_path(store, path=path, mode=mode, storage_options=storage_options)

if chunk_store is not None:
warnings.warn("chunk_store is not yet implemented", RuntimeWarning, stacklevel=2)
Expand Down Expand Up @@ -697,9 +687,7 @@ async def open_group(
if chunk_store is not None:
warnings.warn("chunk_store is not yet implemented", RuntimeWarning, stacklevel=2)

store_path = await make_store_path(store, mode=mode, storage_options=storage_options)
if path is not None:
store_path = store_path / path
store_path = await make_store_path(store, mode=mode, storage_options=storage_options, path=path)

if attributes is None:
attributes = {}
Expand Down Expand Up @@ -883,9 +871,7 @@ async def create(
if not isinstance(store, Store | StorePath):
mode = "a"

store_path = await make_store_path(store, mode=mode, storage_options=storage_options)
if path is not None:
store_path = store_path / path
store_path = await make_store_path(store, path=path, mode=mode, storage_options=storage_options)

return await AsyncArray.create(
store_path,
Expand Down Expand Up @@ -925,6 +911,7 @@ async def empty(
retrieve data from an empty Zarr array, any values may be returned,
and these are not guaranteed to be stable from one access to the next.
"""

return await create(shape=shape, fill_value=None, **kwargs)


Expand Down Expand Up @@ -1044,7 +1031,7 @@ async def open_array(
store: StoreLike | None = None,
zarr_version: ZarrFormat | None = None, # deprecated
zarr_format: ZarrFormat | None = None,
path: PathLike | None = None,
path: PathLike = "",
storage_options: dict[str, Any] | None = None,
**kwargs: Any, # TODO: type kwargs as valid args to save
) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]:
Expand All @@ -1071,9 +1058,7 @@ async def open_array(
"""

mode = kwargs.pop("mode", None)
store_path = await make_store_path(store, mode=mode)
if path is not None:
store_path = store_path / path
store_path = await make_store_path(store, path=path, mode=mode)

zarr_format = _handle_zarr_version_or_format(zarr_version=zarr_version, zarr_format=zarr_format)

Expand Down
38 changes: 38 additions & 0 deletions src/zarr/storage/_utils.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,49 @@
from __future__ import annotations

import re
from pathlib import Path
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from zarr.core.buffer import Buffer


def normalize_path(path: str | bytes | Path | None) -> str:
TomAugspurger marked this conversation as resolved.
Show resolved Hide resolved
if path is None:
result = ""
elif isinstance(path, bytes):
result = str(path, "ascii")

# handle pathlib.Path
elif isinstance(path, Path):
result = str(path)

elif isinstance(path, str):
result = path

else:
raise TypeError(f'Object {path} has an invalid type for "path": {type(path).__name__}')

# convert backslash to forward slash
result = result.replace("\\", "/")

# remove leading and trailing slashes
result = result.strip("/")

# collapse any repeated slashes
d-v-b marked this conversation as resolved.
Show resolved Hide resolved
pat = re.compile(r"//+")
result = pat.sub("/", result)

# disallow path segments with just '.' or '..'
segments = result.split("/")
if any(s in {".", ".."} for s in segments):
raise ValueError(
f"The path {path!r} is invalid because its string representation contains '.' or '..' segments."
)

return result


def _normalize_interval_index(
data: Buffer, interval: None | tuple[int | None, int | None]
) -> tuple[int, int]:
Expand Down
34 changes: 23 additions & 11 deletions src/zarr/storage/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from zarr.core.buffer import Buffer, default_buffer_prototype
from zarr.core.common import ZARR_JSON, ZARRAY_JSON, ZGROUP_JSON, ZarrFormat
from zarr.errors import ContainsArrayAndGroupError, ContainsArrayError, ContainsGroupError
from zarr.storage._utils import normalize_path
from zarr.storage.local import LocalStore
from zarr.storage.memory import MemoryStore

Expand Down Expand Up @@ -41,9 +42,9 @@ class StorePath:
store: Store
path: str

def __init__(self, store: Store, path: str | None = None) -> None:
def __init__(self, store: Store, path: str = "") -> None:
self.store = store
self.path = path or ""
self.path = path

async def get(
self,
Expand Down Expand Up @@ -159,6 +160,7 @@ def __eq__(self, other: object) -> bool:
async def make_store_path(
store_like: StoreLike | None,
*,
path: str | None = "",
mode: AccessModeLiteral | None = None,
storage_options: dict[str, Any] | None = None,
) -> StorePath:
Expand Down Expand Up @@ -189,6 +191,9 @@ async def make_store_path(
----------
store_like : StoreLike | None
The object to convert to a `StorePath` object.
path: str | None, optional
The path to use when creating the `StorePath` object. If None, the
default path is the empty string.
mode : AccessModeLiteral | None, optional
The mode to use when creating the `StorePath` object. If None, the
default mode is 'r'.
Expand All @@ -209,37 +214,44 @@ async def make_store_path(
from zarr.storage.remote import RemoteStore # circular import

used_storage_options = False

path_normalized = normalize_path(path)
if isinstance(store_like, StorePath):
if mode is not None and mode != store_like.store.mode.str:
_store = store_like.store.with_mode(mode)
await _store._ensure_open()
store_like = StorePath(_store)
result = store_like
store_like = StorePath(_store, path=store_like.path)
result = store_like / path_normalized
elif isinstance(store_like, Store):
if mode is not None and mode != store_like.mode.str:
store_like = store_like.with_mode(mode)
await store_like._ensure_open()
result = StorePath(store_like)
result = StorePath(store_like, path=path_normalized)
elif store_like is None:
# mode = "w" is an exception to the default mode = 'r'
result = StorePath(await MemoryStore.open(mode=mode or "w"))
result = StorePath(await MemoryStore.open(mode=mode or "w"), path=path_normalized)
elif isinstance(store_like, Path):
result = StorePath(await LocalStore.open(root=store_like, mode=mode or "r"))
result = StorePath(
await LocalStore.open(root=store_like, mode=mode or "r"), path=path_normalized
)
elif isinstance(store_like, str):
storage_options = storage_options or {}

if _is_fsspec_uri(store_like):
used_storage_options = True
result = StorePath(
RemoteStore.from_url(store_like, storage_options=storage_options, mode=mode or "r")
RemoteStore.from_url(store_like, storage_options=storage_options, mode=mode or "r"),
path=path_normalized,
)
else:
result = StorePath(await LocalStore.open(root=Path(store_like), mode=mode or "r"))
result = StorePath(
await LocalStore.open(root=Path(store_like), mode=mode or "r"), path=path_normalized
)
elif isinstance(store_like, dict):
# We deliberate only consider dict[str, Buffer] here, and not arbitrary mutable mappings.
# By only allowing dictionaries, which are in-memory, we know that MemoryStore appropriate.
result = StorePath(await MemoryStore.open(store_dict=store_like, mode=mode or "r"))
result = StorePath(
await MemoryStore.open(store_dict=store_like, mode=mode or "r"), path=path_normalized
)
else:
msg = f"Unsupported type for store_like: '{type(store_like).__name__}'" # type: ignore[unreachable]
raise TypeError(msg)
Expand Down
5 changes: 4 additions & 1 deletion src/zarr/storage/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,10 @@ def __init__(self, root: Path | str, *, mode: AccessModeLiteral = "r") -> None:
super().__init__(mode=mode)
if isinstance(root, str):
root = Path(root)
assert isinstance(root, Path)
if not isinstance(root, Path):
raise TypeError(
f'"root" must be a string or Path instance. Got an object with type {type(root)} instead.'
)
self.root = root

async def _open(self) -> None:
Expand Down
2 changes: 1 addition & 1 deletion tests/v3/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def array_fixture(request: pytest.FixtureRequest) -> npt.NDArray[Any]:
)


@pytest.fixture(params=(2, 3))
@pytest.fixture(params=(2, 3), ids=["zarr2", "zarr3"])
def zarr_format(request: pytest.FixtureRequest) -> ZarrFormat:
if request.param == 2:
return 2
Expand Down
27 changes: 26 additions & 1 deletion tests/v3/test_api.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pathlib
import warnings
from typing import Literal

import numpy as np
import pytest
Expand All @@ -10,9 +11,19 @@
import zarr.core.group
from zarr import Array, Group
from zarr.abc.store import Store
from zarr.api.synchronous import create, group, load, open, open_group, save, save_array, save_group
from zarr.api.synchronous import (
create,
group,
load,
open,
open_group,
save,
save_array,
save_group,
)
from zarr.core.common import ZarrFormat
from zarr.errors import MetadataValidationError
from zarr.storage._utils import normalize_path
from zarr.storage.memory import MemoryStore


Expand All @@ -37,6 +48,20 @@ def test_create_array(memory_store: Store) -> None:
assert z.chunks == (40,)


@pytest.mark.parametrize("path", ["foo", "/", "/foo", "///foo/bar"])
@pytest.mark.parametrize("node_type", ["array", "group"])
def test_open_normalized_path(
memory_store: MemoryStore, path: str, node_type: Literal["array", "group"]
) -> None:
node: Group | Array
if node_type == "group":
node = group(store=memory_store, path=path)
elif node_type == "array":
node = create(store=memory_store, path=path, shape=(2,))

assert node.path == normalize_path(path)


async def test_open_array(memory_store: MemoryStore) -> None:
store = memory_store

Expand Down
14 changes: 4 additions & 10 deletions tests/v3/test_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import contextlib
import pickle
import warnings
from typing import TYPE_CHECKING, Any, Literal, cast
from typing import TYPE_CHECKING, Any, Literal

import numpy as np
import pytest
Expand All @@ -14,7 +14,6 @@
from zarr import Array, AsyncArray, AsyncGroup, Group
from zarr.abc.store import Store
from zarr.core.buffer import default_buffer_prototype
from zarr.core.common import JSON, ZarrFormat
from zarr.core.group import ConsolidatedMetadata, GroupMetadata
from zarr.core.sync import sync
from zarr.errors import ContainsArrayError, ContainsGroupError
Expand All @@ -26,6 +25,8 @@
if TYPE_CHECKING:
from _pytest.compat import LEGACY_PATH

from zarr.core.common import JSON, ZarrFormat


@pytest.fixture(params=["local", "memory", "zip"])
async def store(request: pytest.FixtureRequest, tmpdir: LEGACY_PATH) -> Store:
Expand All @@ -43,14 +44,6 @@ def exists_ok(request: pytest.FixtureRequest) -> bool:
return result


@pytest.fixture(params=[2, 3], ids=["zarr2", "zarr3"])
def zarr_format(request: pytest.FixtureRequest) -> ZarrFormat:
result = request.param
if result not in (2, 3):
raise ValueError("Wrong value returned from test fixture.")
return cast(ZarrFormat, result)


def test_group_init(store: Store, zarr_format: ZarrFormat) -> None:
"""
Test that initializing a group from an asyncgroup works.
Expand Down Expand Up @@ -587,6 +580,7 @@ def test_group_array_creation(
assert empty_array.fill_value == 0
assert empty_array.shape == shape
assert empty_array.store_path.store == store
assert empty_array.store_path.path == "empty"

empty_like_array = group.empty_like(name="empty_like", data=empty_array)
assert isinstance(empty_like_array, Array)
Expand Down
Loading