Skip to content

Commit

Permalink
Merge branch 'dev' into zarr_append
Browse files Browse the repository at this point in the history
  • Loading branch information
mavaylon1 authored Jul 28, 2024
2 parents 27bb840 + 4c32820 commit dcea8a0
Show file tree
Hide file tree
Showing 6 changed files with 424 additions and 10 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
# HDMF Changelog

## HDMF 3.14.3 (Upcoming)

### Enhancements
- Added new attribute "dimension_labels" on `DatasetBuilder` which specifies the names of the dimensions used in the
dataset based on the shape of the dataset data and the dimension names in the spec for the data type. This attribute
is available on build (during the write process), but not on read of a dataset from a file. @rly [#1081](https://github.com/hdmf-dev/hdmf/pull/1081)

## HDMF 3.14.2 (July 7, 2024)

### Enhancements
Expand Down
21 changes: 21 additions & 0 deletions src/hdmf/backends/hdf5/h5_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,10 +554,31 @@ def __init__(self, **kwargs):

@property
def dataset(self):
    """The cached h5py.Dataset, or None if no dataset has been cached yet.

    Populated by the ``dataset`` setter (typically during write); see the
    setter for the intended fill-in-later usage pattern.
    """
    return self.__dataset

@dataset.setter
def dataset(self, val):
    """Cache the h5py.Dataset written with the stored IO settings.

    This attribute can be used to cache a written, empty dataset and fill it in later.
    This allows users to access the handle to the dataset *without* having to close
    and reopen a file.

    For example::

        dataio = H5DataIO(shape=(5,), dtype=int)
        foo = Foo('foo1', dataio, "I am foo1", 17, 3.14)
        bucket = FooBucket('bucket1', [foo])
        foofile = FooFile(buckets=[bucket])
        io = HDF5IO(self.path, manager=self.manager, mode='w')
        # write the object to disk, including initializing an empty int dataset with shape (5,)
        io.write(foofile)
        foo.my_data.dataset[:] = [0, 1, 2, 3, 4]
        io.close()

    :param val: the h5py.Dataset to cache
    :raises ValueError: if a dataset has already been cached (write-once)
    """
    # Write-once guard: silently replacing an already-cached dataset would
    # orphan the handle that callers may still be holding.
    if self.__dataset is not None:
        raise ValueError("Cannot overwrite H5DataIO.dataset")
    self.__dataset = val
Expand Down
16 changes: 14 additions & 2 deletions src/hdmf/build/builders.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,18 +330,25 @@ class DatasetBuilder(BaseBuilder):
'doc': 'The datatype of this dataset.', 'default': None},
{'name': 'attributes', 'type': dict,
'doc': 'A dictionary of attributes to create in this dataset.', 'default': dict()},
{'name': 'dimension_labels', 'type': tuple,
'doc': ('A list of labels for each dimension of this dataset from the spec. Currently this is '
'supplied only on build.'),
'default': None},
{'name': 'maxshape', 'type': (int, tuple),
'doc': 'The shape of this dataset. Use None for scalars.', 'default': None},
{'name': 'chunks', 'type': bool, 'doc': 'Whether or not to chunk this dataset.', 'default': False},
{'name': 'parent', 'type': GroupBuilder, 'doc': 'The parent builder of this builder.', 'default': None},
{'name': 'source', 'type': str, 'doc': 'The source of the data in this builder.', 'default': None})
def __init__(self, **kwargs):
""" Create a Builder object for a dataset """
name, data, dtype, attributes, maxshape, chunks, parent, source = getargs(
'name', 'data', 'dtype', 'attributes', 'maxshape', 'chunks', 'parent', 'source', kwargs)
name, data, dtype, attributes, dimension_labels, maxshape, chunks, parent, source = getargs(
'name', 'data', 'dtype', 'attributes', 'dimension_labels', 'maxshape', 'chunks', 'parent', 'source',
kwargs
)
super().__init__(name, attributes, parent, source)
self['data'] = data
self['attributes'] = _copy.copy(attributes)
self.__dimension_labels = dimension_labels
self.__chunks = chunks
self.__maxshape = maxshape
if isinstance(data, BaseBuilder):
Expand All @@ -361,6 +368,11 @@ def data(self, val):
raise AttributeError("Cannot overwrite data.")
self['data'] = val

@property
def dimension_labels(self):
    """Labels for each dimension of this dataset from the spec.

    Set from the ``dimension_labels`` constructor argument. Per the changelog,
    this is supplied only on build (write); on read from a file it is None.
    """
    return self.__dimension_labels

@property
def chunks(self):
"""Whether or not this dataset is chunked."""
Expand Down
97 changes: 90 additions & 7 deletions src/hdmf/build/objectmapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from .errors import (BuildError, OrphanContainerBuildError, ReferenceTargetNotBuiltError, ContainerConfigurationError,
ConstructError)
from .manager import Proxy, BuildManager

from .warnings import MissingRequiredBuildWarning, DtypeConversionWarning, IncorrectQuantityBuildWarning
from hdmf.backends.hdf5.h5_utils import H5DataIO

Expand All @@ -19,7 +20,7 @@
from ..query import ReferenceResolver
from ..spec import Spec, AttributeSpec, DatasetSpec, GroupSpec, LinkSpec, RefSpec
from ..spec.spec import BaseStorageSpec
from ..utils import docval, getargs, ExtenderMeta, get_docval
from ..utils import docval, getargs, ExtenderMeta, get_docval, get_data_shape

_const_arg = '__constructor_arg'

Expand Down Expand Up @@ -723,19 +724,34 @@ def build(self, **kwargs):
if not isinstance(container, Data):
msg = "'container' must be of type Data with DatasetSpec"
raise ValueError(msg)
spec_dtype, spec_shape, spec = self.__check_dset_spec(self.spec, spec_ext)
spec_dtype, spec_shape, spec_dims, spec = self.__check_dset_spec(self.spec, spec_ext)
dimension_labels = self.__get_dimension_labels_from_spec(container.data, spec_shape, spec_dims)
if isinstance(spec_dtype, RefSpec):
self.logger.debug("Building %s '%s' as a dataset of references (source: %s)"
% (container.__class__.__name__, container.name, repr(source)))
# create dataset builder with data=None as a placeholder. fill in with refs later
builder = DatasetBuilder(name, data=None, parent=parent, source=source, dtype=spec_dtype.reftype)
builder = DatasetBuilder(
name,
data=None,
parent=parent,
source=source,
dtype=spec_dtype.reftype,
dimension_labels=dimension_labels,
)
manager.queue_ref(self.__set_dataset_to_refs(builder, spec_dtype, spec_shape, container, manager))
elif isinstance(spec_dtype, list):
# a compound dataset
self.logger.debug("Building %s '%s' as a dataset of compound dtypes (source: %s)"
% (container.__class__.__name__, container.name, repr(source)))
# create dataset builder with data=None, dtype=None as a placeholder. fill in with refs later
builder = DatasetBuilder(name, data=None, parent=parent, source=source, dtype=spec_dtype)
builder = DatasetBuilder(
name,
data=None,
parent=parent,
source=source,
dtype=spec_dtype,
dimension_labels=dimension_labels,
)
manager.queue_ref(self.__set_compound_dataset_to_refs(builder, spec, spec_dtype, container,
manager))
else:
Expand All @@ -746,7 +762,14 @@ def build(self, **kwargs):
% (container.__class__.__name__, container.name, repr(source)))
# an unspecified dtype and we were given references
# create dataset builder with data=None as a placeholder. fill in with refs later
builder = DatasetBuilder(name, data=None, parent=parent, source=source, dtype='object')
builder = DatasetBuilder(
name,
data=None,
parent=parent,
source=source,
dtype="object",
dimension_labels=dimension_labels,
)
manager.queue_ref(self.__set_untyped_dataset_to_refs(builder, container, manager))
else:
# a dataset that has no references, pass the conversion off to the convert_dtype method
Expand All @@ -762,7 +785,14 @@ def build(self, **kwargs):
except Exception as ex:
msg = 'could not resolve dtype for %s \'%s\'' % (type(container).__name__, container.name)
raise Exception(msg) from ex
builder = DatasetBuilder(name, bldr_data, parent=parent, source=source, dtype=dtype)
builder = DatasetBuilder(
name,
data=bldr_data,
parent=parent,
source=source,
dtype=dtype,
dimension_labels=dimension_labels,
)

# Add attributes from the specification extension to the list of attributes
all_attrs = self.__spec.attributes + getattr(spec_ext, 'attributes', tuple())
Expand All @@ -781,14 +811,67 @@ def __check_dset_spec(self, orig, ext):
"""
dtype = orig.dtype
shape = orig.shape
dims = orig.dims
spec = orig
if ext is not None:
if ext.dtype is not None:
dtype = ext.dtype
if ext.shape is not None:
shape = ext.shape
dims = ext.dims
spec = ext
return dtype, shape, spec
return dtype, shape, dims, spec

def __get_dimension_labels_from_spec(self, data, spec_shape, spec_dims) -> "tuple | None":
    """Determine the dimension labels for ``data`` from the spec shape(s)/dims.

    :param data: the dataset data being built (used only for its shape)
    :param spec_shape: allowed shape(s) from the spec -- either a single list of
        dimension lengths (``None`` = any length) or a list of such lists
    :param spec_dims: dimension names from the spec, parallel to ``spec_shape``
    :return: a tuple of dimension labels for the shape of ``data``, or ``None``
        if the spec has no shape/dims or if the data shape matches no allowed
        shape (in which case an ``IncorrectDatasetShapeBuildWarning`` is issued)
    """
    if spec_shape is None or spec_dims is None:
        return None
    data_shape = get_data_shape(data)

    def _shape_matches(allowed, actual):
        # a spec dimension of None means any length is allowed
        return len(allowed) == len(actual) and all(
            a is None or a == d for a, d in zip(allowed, actual)
        )

    # if shape is a list of allowed shapes, find the indices of the shapes that match the data
    if isinstance(spec_shape[0], list):
        match_shape_inds = [i for i, s in enumerate(spec_shape) if _shape_matches(s, data_shape)]
        if not match_shape_inds:
            # no matches found
            msg = "Shape of data does not match any allowed shapes in spec '%s'" % self.spec.path
            warnings.warn(msg, IncorrectDatasetShapeBuildWarning)
            return None
        if len(match_shape_inds) == 1:
            return tuple(spec_dims[match_shape_inds[0]])
        # use the most specific match -- the one with the fewest Nones
        # (min returns the first of any ties, matching list.index semantics)
        best_match_ind = min(match_shape_inds, key=lambda k: sum(x is None for x in spec_shape[k]))
        return tuple(spec_dims[best_match_ind])

    # shape is a single list of allowed dimension lengths
    if not _shape_matches(spec_shape, data_shape):
        msg = "Shape of data does not match shape in spec '%s'" % self.spec.path
        warnings.warn(msg, IncorrectDatasetShapeBuildWarning)
        return None
    return tuple(spec_dims)

def __is_reftype(self, data):
if (isinstance(data, AbstractDataChunkIterator) or
Expand Down
7 changes: 7 additions & 0 deletions src/hdmf/build/warnings.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,13 @@ class IncorrectQuantityBuildWarning(BuildWarning):
pass


class IncorrectDatasetShapeBuildWarning(BuildWarning):
    """
    Raised when a dataset is built from data whose shape is not allowed by the spec.
    """
    pass


class MissingRequiredBuildWarning(BuildWarning):
"""
Raised when a required field is missing.
Expand Down
Loading

0 comments on commit dcea8a0

Please sign in to comment.