diff --git a/docs/consolidated_metadata.rst b/docs/guide/consolidated_metadata.rst similarity index 100% rename from docs/consolidated_metadata.rst rename to docs/guide/consolidated_metadata.rst diff --git a/docs/guide/index.rst b/docs/guide/index.rst new file mode 100644 index 000000000..106c35ce8 --- /dev/null +++ b/docs/guide/index.rst @@ -0,0 +1,7 @@ +Guide +===== + +.. toctree:: + :maxdepth: 1 + + consolidated_metadata diff --git a/docs/index.rst b/docs/index.rst index ebb1b2a24..6b90b5a77 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -10,7 +10,7 @@ Zarr-Python getting_started tutorial - consolidated_metadata + guide/index api/index spec release diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index 46fd1cb80..0f50264be 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -36,7 +36,7 @@ async def _decode_single( chunk_numpy_array = ensure_ndarray(chunk_bytes.as_array_like()) # ensure correct dtype - if str(chunk_numpy_array.dtype) != chunk_spec.dtype: + if str(chunk_numpy_array.dtype) != chunk_spec.dtype and not chunk_spec.dtype.hasobject: chunk_numpy_array = chunk_numpy_array.view(chunk_spec.dtype) return get_ndbuffer_class().from_numpy_array(chunk_numpy_array) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 5195db62a..ca405842e 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -35,6 +35,7 @@ ShapeLike, ZarrFormat, concurrent_map, + parse_dtype, parse_shapelike, product, ) @@ -365,16 +366,17 @@ async def create( ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: store_path = await make_store_path(store) + dtype_parsed = parse_dtype(dtype, zarr_format) shape = parse_shapelike(shape) if chunks is not None and chunk_shape is not None: raise ValueError("Only one of chunk_shape or chunks can be provided.") - dtype = np.dtype(dtype) if chunks: - _chunks = normalize_chunks(chunks, shape, dtype.itemsize) + _chunks = normalize_chunks(chunks, shape, dtype_parsed.itemsize) else: - _chunks = normalize_chunks(chunk_shape, shape, dtype.itemsize) + _chunks = normalize_chunks(chunk_shape, shape, dtype_parsed.itemsize) + result: AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata] if zarr_format == 3: if dimension_separator is not None: @@ -396,7 +398,7 @@ async def create( result = await cls._create_v3( store_path, shape=shape, - dtype=dtype, + dtype=dtype_parsed, chunk_shape=_chunks, fill_value=fill_value, chunk_key_encoding=chunk_key_encoding, @@ -406,6 +408,14 @@ async def create( exists_ok=exists_ok, ) elif zarr_format == 2: + if dtype is str or dtype == "str": + # another special case: zarr v2 added the vlen-utf8 codec + vlen_codec: dict[str, JSON] = {"id": "vlen-utf8"} + if filters and not any(x["id"] == "vlen-utf8" for x in filters): + filters = list(filters) + [vlen_codec] + else: + filters = [vlen_codec] + if codecs is not None: raise ValueError( "codecs cannot be used for arrays with version 2. Use filters and compressor instead." @@ -419,7 +429,7 @@ async def create( result = await cls._create_v2( store_path, shape=shape, - dtype=dtype, + dtype=dtype_parsed, chunks=_chunks, dimension_separator=dimension_separator, fill_value=fill_value, diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index efce0f98f..0bc6245cb 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -14,6 +14,10 @@ overload, ) +import numpy as np + +from zarr.core.strings import _STRING_DTYPE + if TYPE_CHECKING: from collections.abc import Awaitable, Callable, Iterator @@ -151,3 +155,13 @@ def parse_order(data: Any) -> Literal["C", "F"]: if data in ("C", "F"): return cast(Literal["C", "F"], data) raise ValueError(f"Expected one of ('C', 'F'), got {data} instead.") + + +def parse_dtype(dtype: Any, zarr_format: ZarrFormat) -> np.dtype[Any]: + if dtype is str or dtype == "str": + if zarr_format == 2: + # special case as object + return np.dtype("object") + else: + return _STRING_DTYPE + return np.dtype(dtype) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index d1dd86880..c5f34d277 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -321,7 +321,7 @@ def _default_fill_value(dtype: np.dtype[Any]) -> Any: """ if dtype.kind == "S": return b"" - elif dtype.kind == "U": + elif dtype.kind in "UO": return "" else: return dtype.type(0) diff --git a/src/zarr/storage/local.py b/src/zarr/storage/local.py index 1e1f3beec..da37cbfd5 100644 --- a/src/zarr/storage/local.py +++ b/src/zarr/storage/local.py @@ -94,6 +94,11 @@ def __init__(self, root: Path | str, *, mode: AccessModeLiteral = "r") -> None: assert isinstance(root, Path) self.root = root + async def _open(self) -> None: + if not self.mode.readonly: + self.root.mkdir(parents=True, exist_ok=True) + return await super()._open() + async def clear(self) -> None: self._check_writable() shutil.rmtree(self.root) diff --git a/tests/v3/test_codecs/test_vlen.py b/tests/v3/test_codecs/test_vlen.py index ca5ccb92f..aaea5dab8 100644 --- a/tests/v3/test_codecs/test_vlen.py +++ b/tests/v3/test_codecs/test_vlen.py @@ -11,7 +11,7 @@ from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING from zarr.storage.common import StorePath -numpy_str_dtypes: list[type | str | None] = [None, str, np.dtypes.StrDType] +numpy_str_dtypes: list[type | str | None] = [None, str, "str", np.dtypes.StrDType] expected_zarr_string_dtype: np.dtype[Any] if _NUMPY_SUPPORTS_VLEN_STRING: numpy_str_dtypes.append(np.dtypes.StringDType) diff --git a/tests/v3/test_store/test_local.py b/tests/v3/test_store/test_local.py index 8cdb4e874..5352e3520 100644 --- a/tests/v3/test_store/test_local.py +++ b/tests/v3/test_store/test_local.py @@ -1,11 +1,17 @@ from __future__ import annotations +from typing import TYPE_CHECKING + import pytest +import zarr from zarr.core.buffer import Buffer, cpu from zarr.storage.local import LocalStore from zarr.testing.store import StoreTests +if TYPE_CHECKING: + import pathlib + class TestLocalStore(StoreTests[LocalStore, cpu.Buffer]): store_cls = LocalStore @@ -40,3 +46,10 @@ async def test_empty_with_empty_subdir(self, store: LocalStore) -> None: assert await store.empty() (store.root / "foo/bar").mkdir(parents=True) assert await store.empty() + + def test_creates_new_directory(self, tmp_path: pathlib.Path): + target = tmp_path.joinpath("a", "b", "c") + assert not target.exists() + + store = self.store_cls(root=target, mode="w") + zarr.group(store=store) diff --git a/tests/v3/test_v2.py b/tests/v3/test_v2.py index d981fbc89..729ed0533 100644 --- a/tests/v3/test_v2.py +++ b/tests/v3/test_v2.py @@ -2,6 +2,7 @@ from collections.abc import Iterator from typing import Any +import numcodecs.vlen import numpy as np import pytest from numcodecs import Delta @@ -44,7 +45,7 @@ def test_simple(store: StorePath) -> None: ("float64", 0.0), ("|S1", b""), ("|U1", ""), - ("object", 0), + ("object", ""), (str, ""), ], ) @@ -53,7 +54,12 @@ def test_implicit_fill_value(store: StorePath, dtype: str, fill_value: Any) -> N assert arr.metadata.fill_value is None assert arr.metadata.to_dict()["fill_value"] is None result = arr[:] - expected = np.full(arr.shape, fill_value, dtype=dtype) + if dtype is str: + # special case + numpy_dtype = np.dtype(object) + else: + numpy_dtype = np.dtype(dtype) + expected = np.full(arr.shape, fill_value, dtype=numpy_dtype) np.testing.assert_array_equal(result, expected) @@ -106,3 +112,14 @@ async def test_v2_encode_decode(dtype): data = zarr.open_array(store=store, path="foo")[:] expected = np.full((3,), b"X", dtype=dtype) np.testing.assert_equal(data, expected) + + +@pytest.mark.parametrize("dtype", [str, "str"]) +async def test_create_dtype_str(dtype: Any) -> None: + arr = zarr.create(shape=3, dtype=dtype, zarr_format=2) + assert arr.dtype.kind == "O" + assert arr.metadata.to_dict()["dtype"] == "|O" + assert arr.metadata.filters == (numcodecs.vlen.VLenUTF8(),) + arr[:] = ["a", "bb", "ccc"] + result = arr[:] + np.testing.assert_array_equal(result, np.array(["a", "bb", "ccc"], dtype="object"))