use xarray.open_zarr and make aiohttp and s3fs optional #1016

Merged
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -51,7 +51,7 @@ jobs:

- name: Test titiler.xarray
run: |
python -m pip install -e src/titiler/xarray["test"]
python -m pip install -e src/titiler/xarray["test,all"]
python -m pytest src/titiler/xarray --cov=titiler.xarray --cov-report=xml --cov-append --cov-report=term-missing

- name: Test titiler.mosaic
21 changes: 14 additions & 7 deletions src/titiler/xarray/pyproject.toml
@@ -30,24 +30,31 @@ classifiers = [
dynamic = ["version"]
dependencies = [
"titiler.core==0.19.0.dev",
"cftime",
"h5netcdf",
"xarray",
"rioxarray",
"zarr",
"fsspec",
"s3fs",
"aiohttp",
"pandas",
"httpx",
"zarr",
"h5netcdf",
Comment on lines +36 to +37 (Member):

IMO Zarr and h5netcdf should be optional dependencies so people could deploy titiler-xarray with only one option to keep image size down. This may also make it easier to support other HDF5 readers that work better for remote files in the future (e.g., https://github.com/gauteh/hidefix).

Suggested change:
-"zarr",
-"h5netcdf",

"cftime",
]

[project.optional-dependencies]
s3 = [
"s3fs",
]
http = [
"aiohttp",
]
all = [
"s3fs",
"aiohttp",
]
Comment on lines +48 to +51 (Member):

Suggested change:
-all = [
-"s3fs",
-"aiohttp",
-]
+zarr = [
+"zarr",
+]
+hdf5 = [
+"h5netcdf",
+]
+all = [
+"s3fs",
+"aiohttp",
+"zarr",
+"hdf5",
+]

Suggestion to align with the above comment about optional dependencies.

Comment (Member Author):

@maxrjones I'll take care of this either in the main PR or in another one later

test = [
"pytest",
"pytest-cov",
"pytest-asyncio",
"httpx",
"kerchunk",
]

[project.urls]
24 changes: 24 additions & 0 deletions src/titiler/xarray/tests/fixtures/generate_fixtures.ipynb
@@ -138,6 +138,30 @@
" ds.to_zarr(store=f\"pyramid.zarr\", mode=\"w\", group=ix)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import fsspec\n",
"from kerchunk.hdf import SingleHdf5ToZarr\n",
"\n",
"with fsspec.open(\"dataset_3d.nc\", mode=\"rb\", anon=True) as infile:\n",
" h5chunks = SingleHdf5ToZarr(infile, \"dataset_3d.nc\", inline_threshold=100)\n",
"\n",
" with open(\"reference.json\", 'w') as f:\n",
" f.write(json.dumps(h5chunks.translate()));\n"
]
},
{
"cell_type": "code",
"execution_count": null,
1 change: 1 addition & 0 deletions src/titiler/xarray/tests/fixtures/reference.json
@@ -0,0 +1 @@
{"version": 1, "refs": {".zgroup": "{\"zarr_format\":2}", "dataset/.zarray": "{\"chunks\":[1,500,1000],\"compressor\":null,\"dtype\":\"<f8\",\"fill_value\":\"NaN\",\"filters\":[{\"elementsize\":8,\"id\":\"shuffle\"},{\"id\":\"zlib\",\"level\":9}],\"order\":\"C\",\"shape\":[2,1000,2000],\"zarr_format\":2}", "dataset/.zattrs": "{\"_ARRAY_DIMENSIONS\":[\"time\",\"y\",\"x\"],\"fill_value\":0,\"valid_max\":1000.0,\"valid_min\":1.0}", "dataset/0.0.0": ["tests/fixtures/dataset_3d.nc", 37134, 113251], "dataset/0.0.1": ["tests/fixtures/dataset_3d.nc", 150385, 112805], "dataset/0.1.0": ["tests/fixtures/dataset_3d.nc", 263190, 65106], "dataset/0.1.1": ["tests/fixtures/dataset_3d.nc", 328296, 65049], "dataset/1.0.0": ["tests/fixtures/dataset_3d.nc", 393345, 65468], "dataset/1.0.1": ["tests/fixtures/dataset_3d.nc", 458813, 65506], "dataset/1.1.0": ["tests/fixtures/dataset_3d.nc", 524319, 58101], "dataset/1.1.1": ["tests/fixtures/dataset_3d.nc", 582420, 58075], "time/.zarray": "{\"chunks\":[2],\"compressor\":null,\"dtype\":\"<i8\",\"fill_value\":null,\"filters\":null,\"order\":\"C\",\"shape\":[2],\"zarr_format\":2}", "time/.zattrs": "{\"_ARRAY_DIMENSIONS\":[\"time\"],\"calendar\":\"proleptic_gregorian\",\"units\":\"days since 2022-01-01 00:00:00\"}", "time/0": "\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000m\u0001\u0000\u0000\u0000\u0000\u0000\u0000", "x/.zarray": "{\"chunks\":[2000],\"compressor\":null,\"dtype\":\"<f8\",\"fill_value\":\"NaN\",\"filters\":null,\"order\":\"C\",\"shape\":[2000],\"zarr_format\":2}", "x/.zattrs": "{\"_ARRAY_DIMENSIONS\":[\"x\"]}", "x/0": ["tests/fixtures/dataset_3d.nc", 1415, 16000], "y/.zarray": "{\"chunks\":[1000],\"compressor\":null,\"dtype\":\"<f8\",\"fill_value\":\"NaN\",\"filters\":null,\"order\":\"C\",\"shape\":[1000],\"zarr_format\":2}", "y/.zattrs": "{\"_ARRAY_DIMENSIONS\":[\"y\"]}", "y/0": ["tests/fixtures/dataset_3d.nc", 17570, 8000]}}
42 changes: 38 additions & 4 deletions src/titiler/xarray/tests/test_io_tools.py
@@ -1,11 +1,14 @@
"""test titiler.xarray.io utility functions."""

import json
import os
from datetime import datetime

import fsspec
import numpy
import pytest
import xarray
from kerchunk.hdf import SingleHdf5ToZarr

from titiler.xarray.io import Reader, get_variable

@@ -109,12 +112,19 @@ def test_get_variable():


@pytest.mark.parametrize(
"filename",
["dataset_2d.nc", "dataset_3d.nc", "dataset_3d.zarr"],
"protocol,filename",
[
("file://", "dataset_2d.nc"),
("file://", "dataset_3d.nc"),
("file://", "dataset_3d.zarr"),
("", "dataset_2d.nc"),
("", "dataset_3d.nc"),
("", "dataset_3d.zarr"),
],
)
def test_reader(filename):
def test_reader(protocol, filename):
"""test reader."""
src_path = os.path.join(prefix, filename)
src_path = protocol + os.path.join(protocol, prefix, filename)
assert Reader.list_variables(src_path) == ["dataset"]

with Reader(src_path, variable="dataset") as src:
@@ -134,3 +144,27 @@ def test_zarr_group(group):
assert src.info()
assert src.tile(0, 0, 0)
assert src.point(0, 0).data[0] == group * 2


def test_kerchunk_reference(tmp_path):
"""test Kerchunk reference."""
d = tmp_path / "ref"
d.mkdir()

netcdf = os.path.join(prefix, "dataset_3d.nc")
reference = os.path.join(
str(d),
"reference.json",
)

with fsspec.open(netcdf, mode="rb", anon=True) as infile:
h5chunks = SingleHdf5ToZarr(infile, netcdf, inline_threshold=100)
with open(reference, "w") as f:
f.write(json.dumps(h5chunks.translate()))

for protocol in ["", "reference://"]:
src_path = protocol + reference
assert Reader.list_variables(src_path) == ["dataset"]
with Reader(src_path, variable="dataset") as src:
assert src.info()
assert src.tile(0, 0, 0)
16 changes: 0 additions & 16 deletions src/titiler/xarray/titiler/xarray/dependencies.py
@@ -22,14 +22,6 @@ class XarrayIOParams(DefaultDependency):
),
] = None

reference: Annotated[
Optional[bool],
Query(
title="reference",
description="Whether the dataset is a kerchunk reference",
),
] = None

decode_times: Annotated[
Optional[bool],
Query(
@@ -38,14 +30,6 @@
),
] = None

consolidated: Annotated[
Optional[bool],
Query(
title="consolidated",
description="Whether to expect and open zarr store with consolidated metadata",
),
] = None

# cache_client


82 changes: 43 additions & 39 deletions src/titiler/xarray/titiler/xarray/io.py
@@ -1,18 +1,27 @@
"""titiler.xarray.io"""

import pickle
import re
from typing import Any, Callable, Dict, List, Optional, Protocol
from urllib.parse import urlparse

import attr
import fsspec
import numpy
import s3fs
import xarray
from morecantile import TileMatrixSet
from rio_tiler.constants import WEB_MERCATOR_TMS
from rio_tiler.io.xarray import XarrayReader

try:
import s3fs
except ImportError: # pragma: nocover
s3fs = None # type: ignore

try:
import aiohttp
except ImportError: # pragma: nocover
aiohttp = None # type: ignore


class CacheClient(Protocol):
"""CacheClient Protocol."""
@@ -26,27 +35,18 @@ def set(self, key: str, body: bytes) -> None:
...


def parse_protocol(src_path: str, reference: Optional[bool] = False) -> str:
def parse_protocol(src_path: str) -> str:
"""Parse protocol from path."""
match = re.match(r"^(s3|https|http)", src_path)
protocol = "file"
if match:
protocol = match.group(0)

# override protocol if reference
if reference:
protocol = "reference"

return protocol
parsed = urlparse(src_path)
return parsed.scheme or "file"


def xarray_engine(src_path: str) -> str:
"""Parse xarray engine from path."""
# ".hdf", ".hdf5", ".h5" will be supported once we have tests + expand the type permitted for the group parameter
if any(src_path.lower().endswith(ext) for ext in [".nc", ".nc4"]):
return "h5netcdf"
else:
return "zarr"
return "zarr"


def get_filesystem(
@@ -59,18 +59,27 @@ def get_filesystem(
Get the filesystem for the given source path.
"""
if protocol == "s3":
assert s3fs is not None, "s3fs must be installed to support S3:// url"

s3_filesystem = s3fs.S3FileSystem()
return (
s3_filesystem.open(src_path)
if xr_engine == "h5netcdf"
else s3fs.S3Map(root=src_path, s3=s3_filesystem)
)

elif protocol == "reference":
reference_args = {"fo": src_path, "remote_options": {"anon": anon}}
elif protocol == "reference" or src_path.lower().endswith(".json"):
reference_args = {
"fo": src_path.replace("reference://", ""),
"remote_options": {"anon": anon},
}
return fsspec.filesystem("reference", **reference_args).get_mapper("")
Comment (Member Author):

check if we have a reference:// url or a .json file

Comment (Member):

there are at least three ways to store virtual references:

  • json following kerchunk spec
  • parquet following kerchunk spec
  • icechunk spec (currently mostly messagepack)

Do you think the "reference" protocol will be used for all of these in the future, with some automatic differentiation between the storage options? Another option would be to specifically have "kerchunk" and "icechunk" as protocol options, with automatic differentiations between json and parquet based on the file extension.
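
For context, a minimal sketch (not part of this PR) of how the two kerchunk flavors would typically be wired through fsspec's "reference" filesystem; the paths are hypothetical, and the parquet form needs the fastparquet dependency mentioned later in this thread:

import fsspec
import xarray

# kerchunk spec, JSON reference (hypothetical paths)
fs_json = fsspec.filesystem(
    "reference",
    fo="reference.json",            # the reference file itself
    remote_protocol="s3",           # protocol of the *referenced* chunks
    remote_options={"anon": True},
)
ds_json = xarray.open_zarr(fs_json.get_mapper(""), consolidated=False)

# kerchunk spec, Parquet reference: same fsspec protocol, but the references
# live in a directory of parquet files (requires fastparquet)
fs_parq = fsspec.filesystem(
    "reference",
    fo="reference.parq",            # directory of parquet reference files
    remote_protocol="s3",
    remote_options={"anon": True},
)
ds_parq = xarray.open_zarr(fs_parq.get_mapper(""), consolidated=False)

# icechunk, per the discussion below, is expected to ship its own store API
# rather than go through an fsspec protocol, so it would need separate handling.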

Comment (Member Author):

I guess we could just support reference:// prefix instead of checking the .json

I'm also not sure how to support S3:// referenced files. I'm reading that you can use "remote_protocol": "s3", but we don't have a way to know this!

cc @abarciauskas-bgse

Comment (Member Author):

8e10da4

only support references with the reference:// prefix, and added support for parquet files (but the fastparquet dependency needs to be installed)

Comment (Contributor):

I'm going to try to figure out how to test this locally, but in the meantime, here is what I am wondering:

  • json following kerchunk spec - the current implementation should work for local json references (reference://file.json) but would it work for remotely stored references, e.g. s3://file.json or http://file.json?
  • parquet following kerchunk spec - files will typically end in .parq or .parquet - again I think this would work for local parquet stores but not sure about remote parquet stores which will be using the s3 or https protocol
  • @maxrjones have you tested opening an icechunk store with fsspec and passing that as an argument to xarray? I haven't so I am not sure if it will work, but I think we can address icechunk support later.

If we have src_path.lower().endswith(".json"), I feel like we should also have a .parquet conditional here, but I think this is too coupled to the formats kerchunk is implemented in and we should rely on something else... like a path parameter 😬

Comment (Member Author):

in the latest commit I've removed the .json test to only accept reference:// prefix

I guess we could accept things like reference+s3:// 🤷

Comment (Member Author):

@abarciauskas-bgse 🤔 I'm not sure we can really support non-local reference files!

FYI, I've tried reference://s3://ds-satellite/netcdf/reference.json and it correctly read the reference file but failed to read the referenced netcdf (stored on S3 as well). I guess I need to set remote_protocol="s3" somewhere.

@abarciauskas-bgse are we using kerchunk in any production project? To me it seems this needs more input/definition.
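
If remote references were to be supported, the remote_protocol would most likely be threaded into the reference_args built in get_filesystem above; a rough sketch under that assumption (how the protocol of the referenced files would be discovered is exactly the open question here):

import fsspec


def get_reference_filesystem(src_path: str, anon: bool = True):
    """Hypothetical variant of the reference branch of get_filesystem()."""
    reference_args = {
        "fo": src_path.replace("reference://", ""),
        "remote_options": {"anon": anon},
        # protocol of the *referenced* chunks (e.g. a netcdf on S3);
        # not currently known from src_path alone
        "remote_protocol": "s3",
    }
    return fsspec.filesystem("reference", **reference_args).get_mapper("")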

Comment (Contributor):

We are using it here: https://www.earthdata.nasa.gov/dashboard/data-catalog/combined_CMIP6_daily_GISS-E2-1-G_tas_kerchunk_DEMO (which I believe uses an s3 protocol for the reference file (private bucket) and the netcdfs it points to are in a public s3 bucket) so I know it should work. I can take a closer look later today ...

Comment (Contributor, @abarciauskas-bgse, Nov 4, 2024):

by the way, I'm on the fence about whether we support kerchunk-style references in this new xarray integration to core titiler, since I think we will be able to use icechunk for reference-style (aka virtual) access in the future. Anyways, kerchunk-style references are still supported by titiler-xarray so we wouldn't have to deprecate https://www.earthdata.nasa.gov/dashboard/data-catalog/combined_CMIP6_daily_GISS-E2-1-G_tas_kerchunk_DEMO right away.

@maxrjones thoughts here?

Comment (Member):

I can see the motivation for dropping support, since it seems that icechunk is committed to avoiding the fsspec method of using keyword arguments for configuration, which is the source of a lot of the complexity here.

Comment (Contributor):

Ya, ok, @vincentsarago, sorry for the delay in the decision here, but let's just drop reference support for now and we can add it back if we decide we need it.

Ideally we could have some documentation about how to easily add it, if people want kerchunk-style reference support with titiler, but I'm not sure at this point how we would do that.
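
One possible shape for that documentation, sketched under the assumption that Reader keeps the pluggable opener attribute shown further down in this diff (the import path follows the tests in this PR; the bucket and reference path are hypothetical):

from typing import Any, Optional

import fsspec
import xarray

from titiler.xarray.io import Reader


def kerchunk_open_dataset(
    src_path: str,
    group: Optional[Any] = None,
    decode_times: Optional[bool] = True,
    cache_client=None,  # kept for signature parity with xarray_open_dataset
) -> xarray.Dataset:
    """Open a kerchunk JSON reference as a Zarr store."""
    fs = fsspec.filesystem(
        "reference",
        fo=src_path.replace("reference://", ""),
        remote_protocol="s3",           # assumes the referenced chunks live on S3
        remote_options={"anon": True},
    )
    return xarray.open_zarr(
        fs.get_mapper(""),
        group=group,
        decode_times=decode_times,
        consolidated=False,
    )


# usage: pass the custom opener instead of the default xarray_open_dataset
with Reader(
    "reference://s3://my-bucket/reference.json",  # hypothetical reference
    variable="dataset",
    opener=kerchunk_open_dataset,
) as src:
    info = src.info()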


elif protocol in ["https", "http", "file"]:
elif protocol in ["https", "http", "file", "reference"]:
if protocol.startswith("http"):
assert (
aiohttp is not None
), "aiohttp must be installed to support HTTP:// url"

filesystem = fsspec.filesystem(protocol) # type: ignore
return (
filesystem.open(src_path)
@@ -85,9 +94,7 @@ def get_filesystem(
def xarray_open_dataset(
src_path: str,
group: Optional[Any] = None,
reference: Optional[bool] = False,
decode_times: Optional[bool] = True,
consolidated: Optional[bool] = True,
cache_client: Optional[CacheClient] = None,
) -> xarray.Dataset:
"""Open dataset."""
@@ -98,7 +105,7 @@ def xarray_open_dataset(
if data_bytes:
return pickle.loads(data_bytes)

protocol = parse_protocol(src_path, reference=reference)
protocol = parse_protocol(src_path)
xr_engine = xarray_engine(src_path)
file_handler = get_filesystem(src_path, protocol, xr_engine)

@@ -117,17 +124,22 @@ if xr_engine == "h5netcdf":
if xr_engine == "h5netcdf":
xr_open_args["engine"] = "h5netcdf"
xr_open_args["lock"] = False
else:
# Zarr arguments
xr_open_args["engine"] = "zarr"
xr_open_args["consolidated"] = consolidated
ds = xarray.open_dataset(file_handler, **xr_open_args)

elif protocol == "reference" or src_path.lower().endswith(".json"):
xr_open_args.update(
{
"engine": "zarr",
"consolidated": False,
"backend_kwargs": {"consolidated": False},
}
)

# Additional arguments when dealing with a reference file.
if reference:
xr_open_args["consolidated"] = False
xr_open_args["backend_kwargs"] = {"consolidated": False}
ds = xarray.open_dataset(file_handler, **xr_open_args)

ds = xarray.open_dataset(file_handler, **xr_open_args)
# Fallback to Zarr
else:
ds = xarray.open_zarr(file_handler, **xr_open_args)

if cache_client:
# Serialize the dataset to bytes using pickle
@@ -245,9 +257,7 @@ class Reader(XarrayReader):
opener: Callable[..., xarray.Dataset] = attr.ib(default=xarray_open_dataset)

group: Optional[Any] = attr.ib(default=None)
reference: bool = attr.ib(default=False)
decode_times: bool = attr.ib(default=False)
consolidated: Optional[bool] = attr.ib(default=True)
cache_client: Optional[CacheClient] = attr.ib(default=None)

# xarray.DataArray options
@@ -266,9 +276,7 @@ def __attrs_post_init__(self):
self.ds = self.opener(
self.src_path,
group=self.group,
reference=self.reference,
decode_times=self.decode_times,
consolidated=self.consolidated,
cache_client=self.cache_client,
)

@@ -293,14 +301,10 @@ def list_variables(
cls,
src_path: str,
group: Optional[Any] = None,
reference: Optional[bool] = False,
consolidated: Optional[bool] = True,
) -> List[str]:
"""List available variable in a dataset."""
with xarray_open_dataset(
src_path,
group=group,
reference=reference,
consolidated=consolidated,
) as ds:
return list(ds.data_vars) # type: ignore