Externallinkfix (#153)
* Show linked chunk counts

* support hdf5:// prefix in external link refs

* updates for numpy deprecation errors
jreadey authored Sep 28, 2023
1 parent ca1f8ae commit 0f10aa1
Showing 24 changed files with 165 additions and 344 deletions.
50 changes: 39 additions & 11 deletions h5pyd/_apps/hsls.py
@@ -145,32 +145,60 @@ def dump(name, obj, visited=None):
         if isinstance(obj.id.layout, dict):
             # H5D_CHUNKED_REF layout
             chunk_dims = obj.id.layout["dims"]
-            storage_desc = "Storage " + obj.id.layout["class"]
+            obj_layout = obj.id.layout["class"]
         else:
             chunk_dims = obj.chunks
-            storage_desc = "Storage H5D_CHUNKED"
-        for chunk_dim in chunk_dims:
+            obj_layout = "H5D_CHUNKED"
+        storage_desc = f"Storage {obj_layout}"
+        max_chunk_count = 1
+        rank = len(obj.shape)
+        for i in range(rank):
+            extent = obj.shape[i]
+            chunk_dim = chunk_dims[i]
             chunk_size *= chunk_dim
+            max_chunk_count *= -(-extent // chunk_dim)
         dset_size = obj.dtype.itemsize
         for dim_extent in obj.shape:
             dset_size *= dim_extent

-        num_chunks = obj.num_chunks
-        allocated_size = obj.allocated_size
+        if obj_layout == "H5D_CHUNKED_REF_INDIRECT":
+            chunk_table_id = obj.id.layout["chunk_table"]
+            chunk_table = obj.file[f"datasets/{chunk_table_id}"]
+            num_chunks = int(np.prod(chunk_table.shape))
+            chunk_table_elements = chunk_table[...].reshape((num_chunks,))
+            num_linked_chunks = 0
+            allocated_size = 0
+            for e in chunk_table_elements:
+                chunk_offset = e[0]
+                chunk_size = e[1]
+                if chunk_offset > 0 and chunk_size > 0:
+                    num_linked_chunks += 1
+                    allocated_size += chunk_size
+            num_chunks = num_linked_chunks
+            chunk_type = "linked"
+
+        else:
+            num_chunks = obj.num_chunks
+            allocated_size = obj.allocated_size
+            chunk_type = "allocated"
+
         if num_chunks is not None and allocated_size is not None:
-            fstr = " {0:>32}: {1} {2} bytes, {3} allocated chunks"
-            print(fstr.format("Chunks", chunk_dims, intToStr(chunk_size),
-                  intToStr(num_chunks)))
+            fstr = " {0:>32}: {1} {2} bytes, {3}/{4} {5} chunks"
+
+            s = fstr.format("Chunks", chunk_dims, intToStr(chunk_size), intToStr(num_chunks),
+                            intToStr(max_chunk_count), chunk_type)
+            print(s)
             if dset_size > 0:
                 utilization = allocated_size / dset_size
-                fstr = " {0:>32}: {1} logical bytes, {2} allocated bytes, {3:.2f}% utilization"
+                fstr = " {0:>32}: {1} logical bytes, {2} {3} bytes, {4:.2f}% utilization"
                 print(fstr.format(storage_desc, intToStr(dset_size),
                       intToStr(allocated_size),
+                      chunk_type,
                       utilization * 100.0))
             else:
-                fstr = " {0:>32}: {1} logical bytes, {2} allocated bytes"
+                fstr = " {0:>32}: {1} logical bytes, {2} {3} bytes"
                 print(fstr.format(storage_desc, intToStr(dset_size),
-                      intToStr(allocated_size)))
+                      intToStr(allocated_size), chunk_type))

         else:
             # verbose info not available, just show the chunk layout
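Note (not part of the commit): a small standalone sketch of the chunk arithmetic the new hsls report relies on. The ceiling division -(-extent // chunk_dim) gives the chunk count along one axis, their product is the maximum chunk count, and linked chunks are counted from (offset, size) records in the same spirit as the H5D_CHUNKED_REF_INDIRECT branch above; the table values here are made up.

    import numpy as np

    def max_chunk_count(shape, chunk_dims):
        """Upper bound on the number of chunks a dataset can have."""
        count = 1
        for extent, chunk_dim in zip(shape, chunk_dims):
            count *= -(-extent // chunk_dim)  # ceil(extent / chunk_dim) in integer math
        return count

    print(max_chunk_count((10, 11), (3, 5)))  # 4 * 3 = 12

    # Count "linked" chunks from a chunk table of (offset, size) records (made-up values).
    chunk_table = np.array([(0, 0), (512, 4096), (8192, 4096)],
                           dtype=[("offset", "<i8"), ("size", "<i8")])
    linked = [int(e["size"]) for e in chunk_table if e["offset"] > 0 and e["size"] > 0]
    print(len(linked), sum(linked))  # 2 linked chunks, 8192 bytes allocated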
4 changes: 0 additions & 4 deletions h5pyd/_apps/hstouch.py
@@ -69,10 +69,6 @@ def touchDomain(domain):
     parent_domain = getParentDomain(domain)

     if parent_domain == "/":
-        if not domain.endswith("/"):
-            msg = "Only folders can be created as a top-level domain"
-            logging.error(msg)
-            sys.exit(msg)
         if len(domain) < 4:
             msg = "Top-level folders must be at least three characters"
             logging.error(msg)
4 changes: 2 additions & 2 deletions h5pyd/_hl/attrs.py
@@ -252,7 +252,7 @@ def create(self, name, data, shape=None, dtype=None):
         # Not an array type; make sure to check the number of elements
         # is compatible, and reshape if needed.
         else:
-            if numpy.product(shape) != numpy.product(data.shape):
+            if numpy.prod(shape) != numpy.prod(data.shape):
                 raise ValueError("Shape of new attribute conflicts with shape of data")

             if shape != data.shape:
@@ -321,7 +321,7 @@ def modify(self, name, value):
             # Allow the case of () <-> (1,)
             if (value.shape != attr.shape) and not \
-               (numpy.product(value.shape) == 1 and numpy.product(attr.shape) == 1):
+               (numpy.prod(value.shape) == 1 and numpy.prod(attr.shape) == 1):
                 raise TypeError("Shape of data is incompatible with existing attribute")
             attr.write(value)
         """
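Note (not part of the commit): numpy.product is a deprecated alias that newer NumPy releases reject; numpy.prod is the supported spelling and returns the same element count, as in this minimal sketch.

    import numpy as np

    data = np.arange(6)
    shape = (2, 3)
    # np.prod(shape) counts elements just as np.product did
    if np.prod(shape) != np.prod(data.shape):
        raise ValueError("Shape of new attribute conflicts with shape of data")
    print(data.reshape(shape).shape)  # (2, 3)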
5 changes: 3 additions & 2 deletions h5pyd/_hl/base.py
@@ -464,9 +464,10 @@ def getElementCount(buffer, offset):
     count_bytes = bytes(buffer[offset:(offset+4)])

     try:
-        count = int(np.frombuffer(count_bytes, dtype="<i4"))
+        arr = np.frombuffer(count_bytes, dtype="<i4")
+        count = int(arr[0])
     except TypeError as e:
-        msg = "Unexpected error reading count value for variable length elemennt: {}".format(e)
+        msg = f"Unexpected error reading count value for variable length elemennt: {e}"
         raise TypeError(msg)
     if count < 0:
         # shouldn't be negative
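Note (not part of the commit): a sketch of why getElementCount now indexes the decoded array. np.frombuffer returns a one-element array here, and calling int() directly on an array is deprecated in recent NumPy, so the new code takes element 0 first. The buffer below is made up.

    import numpy as np

    buffer = bytes([7, 0, 0, 0]) + b"payload"      # 4-byte little-endian count, then data
    count_bytes = bytes(buffer[0:4])
    arr = np.frombuffer(count_bytes, dtype="<i4")  # one-element array
    count = int(arr[0])                            # index first; int(arr) on an array is deprecated
    print(count, buffer[4:4 + count])              # 7 b'payload'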
65 changes: 23 additions & 42 deletions h5pyd/_hl/dataset.py
@@ -104,8 +104,8 @@ def make_new_dset(
     else:
         shape = (shape,) if isinstance(shape, int) else tuple(shape)
         if data is not None and (
-            numpy.product(shape, dtype=numpy.ulonglong)
-            != numpy.product(data.shape, dtype=numpy.ulonglong)
+            numpy.prod(shape, dtype=numpy.ulonglong)
+            != numpy.prod(data.shape, dtype=numpy.ulonglong)
         ):
             raise ValueError("Shape tuple is incompatible with data")

@@ -399,7 +399,6 @@ def __init__(self, dset, source_sel=None):

         if not dset.chunks:
             # can only use with chunked datasets
-            # (currently all datasets are chunked, but check for future compat)
             raise TypeError("Chunked dataset required")

         if isinstance(dset.chunks, dict):
@@ -426,22 +425,15 @@ def __init__(self, dset, source_sel=None):
         for dim in range(rank):
             s = self._sel[dim]
             if s.start < 0 or s.stop > self._shape[dim] or s.stop <= s.start:
-                raise ValueError(
-                    "Invalid selection - selection region must be within dataset space"
-                )
+                msg = "Invalid selection - selection region must be within dataset space"
+                raise ValueError(msg)
             index = s.start // self._layout[dim]
             self._chunk_index.append(index)

     def __iter__(self):
         return self

     def __next__(self):
-        def get_ret(item):
-            if len(item) == 1:
-                return item[0]
-            else:
-                return tuple(item)
-
         rank = len(self._shape)
         slices = []
         if rank == 0 or self._chunk_index[0] * self._layout[0] >= self._sel[0].stop:
@@ -475,7 +467,7 @@ def get_ret(item):
             # reset to the start and continue iterating with higher dimension
             self._chunk_index[dim] = 0
             dim -= 1
-        return get_ret(slices)
+        return tuple(slices)


 class Dataset(HLObject):
@@ -910,7 +902,7 @@ def _getQueryParam(self, start, stop, step=None):
             step = (1,) * rank
         param += "["
         for i in range(rank):
-            field = "{}:{}:{}".format(start[i], stop[i], step[i])
+            field = f"{start[i]}:{stop[i]}:{step[i]}"
             param += field
             if i != (rank - 1):
                 param += ","
@@ -973,7 +965,7 @@ def __getitem__(self, args, new_dtype=None):
             mshape = sel.guess_shape(sid)
             if mshape is None:
                 return numpy.array((0,), dtype=new_dtype)
-            if numpy.product(mshape) == 0:
+            if numpy.prod(mshape) == 0:
                 return numpy.array(mshape, dtype=new_dtype)
             out = numpy.empty(mshape, dtype=new_dtype)
             sid_out = h5s.create_simple(mshape)
@@ -993,7 +985,7 @@ def __getitem__(self, args, new_dtype=None):

         if self._shape == ():
             selection = sel.select(self, args)
-            self.log.info("selection.mshape: {}".format(selection.mshape))
+            self.log.info(f"selection.mshape: {selection.mshape}")

             # TBD - refactor the following with the code for the non-scalar case
             req = "/datasets/" + self.id.uuid + "/value"
@@ -1006,7 +998,7 @@ def __getitem__(self, args, new_dtype=None):
                 arr = bytesToArray(rsp, new_dtype, self._shape)

                 if not self.dtype.shape:
-                    self.log.debug("reshape arr to: {}".format(self._shape))
+                    self.log.debug(f"reshape arr to: {self._shape}")
                     arr = numpy.reshape(arr, self._shape)
             else:
                 # got JSON response
@@ -1024,11 +1016,8 @@ def __getitem__(self, args, new_dtype=None):
                 arr = numpy.empty((), dtype=new_dtype)
                 arr[()] = data
             if selection.mshape is None:
-                self.log.info(
-                    "return scalar selection of: {}, dtype: {}, shape: {}".format(
-                        arr, arr.dtype, arr.shape
-                    )
-                )
+                msg = f"return scalar selection of: {arr}, dtype: {arr.dtype}, shape: {arr.shape}"
+                self.log.info(msg)
                 val = arr[()]
                 if isinstance(val, str):
                     # h5py always returns bytes, so encode the str
@@ -1308,9 +1297,7 @@ def __getitem__(self, args, new_dtype=None):
                     points, dtype="u8"
                 ) # must use unsigned 64-bit int
                 body = arr_points.tobytes()
-                self.log.info(
-                    "point select binary request, num bytes: {}".format(len(body))
-                )
+                self.log.info(f"point select binary request, num bytes: {len(body)}")
             else:
                 if delistify:
                     self.log.info("delistifying point selection")
@@ -1324,7 +1311,7 @@ def __getitem__(self, args, new_dtype=None):
                 else:
                     # can just assign
                     body["points"] = points
-            self.log.info("sending point selection request: {}".format(body))
+            self.log.info(f"sending point selection request: {body}")
             rsp = self.POST(req, format=format, body=body)
             if type(rsp) in (bytes, bytearray):
                 if len(rsp) // mtype.itemsize != selection.mshape[0]:
@@ -1337,18 +1324,14 @@ def __getitem__(self, args, new_dtype=None):
             else:
                 data = rsp["value"]
                 if len(data) != selection.mshape[0]:
-                    raise IOError(
-                        "Expected {} elements, but got {}".format(
-                            selection.mshape[0], len(data)
-                        )
-                    )
-
+                    msg = f"Expected {selection.mshape[0]} elements, but got {len(data)}"
+                    raise IOError(msg)
                 arr = numpy.asarray(data, dtype=mtype, order="C")

         else:
             raise ValueError("selection type not supported")

-        self.log.info("got arr: {}, cleaning up shape!".format(arr.shape))
+        self.log.info(f"got arr: {arr.shape}, cleaning up shape!")
         # Patch up the output for NumPy
         if len(names) == 1:
             arr = arr[names[0]] # Single-field recarray convention
@@ -1368,7 +1351,7 @@ def __setitem__(self, args, val):
         (slices and integers). For advanced indexing, the shapes must
         match.
         """
-        self.log.info("Dataset __setitem__, args: {}".format(args))
+        self.log.info(f"Dataset __setitem__, args: {args}")
         use_base64 = True # may need to set this to false below for some types

         args = args if isinstance(args, tuple) else (args,)
@@ -1378,7 +1361,7 @@ def __setitem__(self, args, val):
             self.log.debug(
                 f"val dtype: {val.dtype}, shape: {val.shape} metadata: {val.dtype.metadata}"
             )
-            if numpy.product(val.shape) == 0:
+            if numpy.prod(val.shape) == 0:
                 self.log.info("no elements in numpy array, skipping write")
         except AttributeError:
             self.log.debug("val not ndarray")
@@ -1428,7 +1411,7 @@ def __setitem__(self, args, val):
                     i
                     for i in val.reshape(
                         (
-                            numpy.product(val.shape[:-1], dtype=numpy.ulonglong),
+                            numpy.prod(val.shape[:-1], dtype=numpy.ulonglong),
                             val.shape[-1],
                         )
                     )
@@ -1480,7 +1463,7 @@ def __setitem__(self, args, val):
             # TBD - need to handle cases where the type shape is different
             self.log.debug("got numpy array")
             if val.dtype != self.dtype and val.dtype.shape == self.dtype.shape:
-                self.log.info("converting {} to {}".format(val.dtype, self.dtype))
+                self.log.info(f"converting {val.dtype} to {self.dtype}")
                 # convert array
                 tmp = numpy.empty(val.shape, dtype=self.dtype)
                 tmp[...] = val[...]
@@ -1584,15 +1567,13 @@ def __setitem__(self, args, val):
                 data = val.tobytes()
                 data = base64.b64encode(data)
                 data = data.decode("ascii")
-                self.log.debug("data: {}".format(data))
                 body["value_base64"] = data
-                self.log.debug("writing base64 data, {} bytes".format(len(data)))
+                self.log.debug(f"writing base64 data, {len(data)} bytes")
             else:
                 if type(val) is not list:
                     val = val.tolist()
                 val = _decode(val)
-                self.log.debug("writing json data, {} elements".format(len(val)))
-                self.log.debug("data: {}".format(val))
+                self.log.debug(f"writing json data, {len(val)} elements")
                 body["value"] = val

             if selection.select_type != sel.H5S_SELECT_ALL:
@@ -1702,7 +1683,7 @@ def __array__(self, dtype=None):
         arr = numpy.empty(self._shape, dtype=self.dtype if dtype is None else dtype)

         # Special case for (0,)*-shape datasets
-        if self._shape is None or numpy.product(self._shape) == 0:
+        if self._shape is None or numpy.prod(self._shape) == 0:
             return arr

         self.read_direct(arr)
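Note (not part of the commit): with get_ret removed, the chunk iterator now always returns a tuple of slices, even for rank-1 datasets. A self-contained sketch of that style of chunk-aligned iteration, using plain NumPy rather than h5pyd:

    import numpy as np

    def iter_chunk_slices(shape, chunks):
        """Yield one tuple of slices per chunk, in C order."""
        grid = [-(-extent // c) for extent, c in zip(shape, chunks)]
        for idx in np.ndindex(*grid):
            yield tuple(slice(i * c, min((i + 1) * c, extent))
                        for i, c, extent in zip(idx, chunks, shape))

    for s in iter_chunk_slices((5,), (2,)):
        print(s)  # always a tuple, e.g. (slice(0, 2, None),), even for rank 1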
6 changes: 3 additions & 3 deletions h5pyd/_hl/filters.py
@@ -306,7 +306,7 @@ def guess_chunk(shape, maxshape, typesize):

     # Determine the optimal chunk size in bytes using a PyTables expression.
     # This is kept as a float.
-    dset_size = np.product(chunks) * typesize
+    dset_size = np.prod(chunks) * typesize
     target_size = CHUNK_BASE * (2 ** np.log10(dset_size / (1024.0 * 1024)))

     if target_size > CHUNK_MAX:
@@ -321,15 +321,15 @@ def guess_chunk(shape, maxshape, typesize):
         # 1b. We're within 50% of the target chunk size, AND
         # 2. The chunk is smaller than the maximum chunk size

-        chunk_bytes = np.product(chunks) * typesize
+        chunk_bytes = np.prod(chunks) * typesize

         if (
             chunk_bytes < target_size
             or abs(chunk_bytes - target_size) / target_size < 0.5
         ) and chunk_bytes < CHUNK_MAX:
             break

-        if np.product(chunks) == 1:
+        if np.prod(chunks) == 1:
             break # Element size larger than CHUNK_MAX

         chunks[idx % ndims] = np.ceil(chunks[idx % ndims] / 2.0)
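Note (not part of the commit): a simplified sketch of the chunk-guessing heuristic touched above, with assumed constants; it scales a base chunk size by the dataset size, then halves dimensions until the chunk lands near the target. Only the np.prod spelling is taken from the diff.

    import numpy as np

    CHUNK_BASE = 16 * 1024     # assumed base target: 16 KiB
    CHUNK_MIN = 8 * 1024       # assumed floor
    CHUNK_MAX = 1024 * 1024    # assumed ceiling

    def rough_guess_chunk(shape, typesize):
        chunks = np.array(shape, dtype="=f8")
        dset_size = np.prod(chunks) * typesize
        target = CHUNK_BASE * (2 ** np.log10(dset_size / (1024.0 * 1024)))
        target = min(max(target, CHUNK_MIN), CHUNK_MAX)
        idx = 0
        # Halve one dimension at a time until the chunk byte size is at or below target.
        while np.prod(chunks) * typesize > target and np.prod(chunks) > 1:
            chunks[idx % len(shape)] = np.ceil(chunks[idx % len(shape)] / 2.0)
            idx += 1
        return tuple(int(c) for c in chunks)

    print(rough_guess_chunk((4000, 4000), 8))  # (63, 125) under these assumed constants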
3 changes: 2 additions & 1 deletion h5pyd/_hl/group.py
@@ -651,14 +651,15 @@ def __getitem__(self, name):
             # (and hince the httpconn socket won't be closed)
             from .files import File
             external_domain = link_json['h5domain']
-            if not op.isabs(external_domain):
+            if not external_domain.startswith("hdf5://") and not op.isabs(external_domain):
                 current_domain = self._id.http_conn.domain
                 external_domain = op.join(op.dirname(current_domain), external_domain)
                 external_domain = op.normpath(external_domain)
             try:
                 endpoint = self.id.http_conn.endpoint
                 username = self.id.http_conn.username
                 password = self.id.http_conn.password
+                print(external_domain)
                 f = File(external_domain, endpoint=endpoint, username=username, password=password, mode='r')
             except IOError:
                 # unable to find external link
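Note (not part of the commit): a sketch of the path rule the group.py change introduces. An external-link domain that already starts with hdf5:// (or is an absolute path) is used as given; only bare relative names are joined onto the current domain's folder. The helper and domain names below are hypothetical.

    import os.path as op

    def resolve_external_domain(current_domain, external_domain):
        if external_domain.startswith("hdf5://") or op.isabs(external_domain):
            return external_domain            # already absolute, use as-is
        joined = op.join(op.dirname(current_domain), external_domain)
        return op.normpath(joined)            # relative: resolve against the current folder

    print(resolve_external_domain("/home/test_user1/data/main.h5", "other.h5"))
    # /home/test_user1/data/other.h5
    print(resolve_external_domain("/home/test_user1/data/main.h5", "hdf5://shared/tall.h5"))
    # hdf5://shared/tall.h5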