Commit: possible poc
mavaylon1 committed Aug 3, 2024
1 parent b251b78 commit 2387a5e
Showing 5 changed files with 60 additions and 24 deletions.
15 changes: 8 additions & 7 deletions src/hdmf/backends/hdf5/h5tools.py

@@ -1130,9 +1130,9 @@ def write_dataset(self, **kwargs):  # noqa: C901
             return None
         name = builder.name
         data = builder.data
+        matched_spec_shape = builder.spec_shapes
         dataio = None
         options = dict()  # dict with additional
-        # breakpoint()
         if isinstance(data, H5DataIO):
             options['io_settings'] = data.io_settings
             dataio = data
@@ -1244,7 +1244,7 @@ def _filler():
                     return
             # If the compound data type contains only regular data (i.e., no references) then we can write it as usual
             else:
-                dset = self.__list_fill__(parent, name, data, expandable, options)
+                dset = self.__list_fill__(parent, name, data, matched_spec_shape, expandable, options)
         # Write a dataset containing references, i.e., a region or object reference.
         # NOTE: we can ignore options['io_settings'] for scalar data
         elif self.__is_ref(options['dtype']):
@@ -1339,7 +1339,7 @@ def _filler():
             self.__dci_queue.append(dataset=dset, data=data)
         # Write a regular in memory array (e.g., numpy array, list etc.)
         elif hasattr(data, '__len__'):
-            dset = self.__list_fill__(parent, name, data, expandable, options)
+            dset = self.__list_fill__(parent, name, data, matched_spec_shape, expandable, options)
         # Write a regular scalar dataset
         else:
             dset = self.__scalar_fill__(parent, name, data, options)
@@ -1467,7 +1467,7 @@ def __chunked_iter_fill__(cls, parent, name, data, options=None):
         return dset

     @classmethod
-    def __list_fill__(cls, parent, name, data, expandable, options=None):
+    def __list_fill__(cls, parent, name, data, matched_spec_shape, expandable, options=None):
         # define the io settings and data type if necessary
         io_settings = {}
         dtype = None
@@ -1489,12 +1489,13 @@ def __list_fill__(cls, parent, name, data, expandable, options=None):
             data_shape = (len(data),)
         else:
             data_shape = get_data_shape(data)
-        # breakpoint()
         if expandable:
             # Don't override existing settings
             if 'maxshape' not in io_settings:
-                io_settings['maxshape'] = tuple([None]*len(data_shape))
-
+                if matched_spec_shape is not None:
+                    io_settings['maxshape'] = tuple([None]*len(matched_spec_shape))
+                else:
+                    io_settings['maxshape'] = tuple([None]*len(data_shape))
         # Create the dataset
         try:
             dset = parent.create_dataset(name, shape=data_shape, dtype=dtype, **io_settings)
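The maxshape logic above is what makes the written datasets expandable: passing maxshape entries of None to h5py marks those axes as unlimited, so the dataset can be resized after creation. A minimal standalone sketch of that h5py behavior (file and dataset names here are illustrative, not from the commit):

import h5py
import numpy as np

with h5py.File("example.h5", "w") as f:
    # maxshape=(None,) marks the first axis as unlimited, mirroring
    # io_settings['maxshape'] = tuple([None]*len(...)) above
    dset = f.create_dataset("my_data", data=np.arange(5), maxshape=(None,))
    dset.resize((10,))    # allowed because the axis is unlimited
    print(dset.shape)     # (10,)
    print(dset.maxshape)  # (None,)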
16 changes: 12 additions & 4 deletions src/hdmf/build/builders.py

@@ -330,6 +330,9 @@ class DatasetBuilder(BaseBuilder):
              'doc': 'The datatype of this dataset.', 'default': None},
             {'name': 'attributes', 'type': dict,
              'doc': 'A dictionary of attributes to create in this dataset.', 'default': dict()},
+            {'name': 'spec_shapes', 'type': tuple,
+             'doc': ('The shape(s) defined in the spec.'),
+             'default': None},
             {'name': 'dimension_labels', 'type': tuple,
              'doc': ('A list of labels for each dimension of this dataset from the spec. Currently this is '
                      'supplied only on build.'),
@@ -341,22 +344,27 @@ class DatasetBuilder(BaseBuilder):
             {'name': 'source', 'type': str, 'doc': 'The source of the data in this builder.', 'default': None})
     def __init__(self, **kwargs):
         """ Create a Builder object for a dataset """
-        name, data, dtype, attributes, dimension_labels, maxshape, chunks, parent, source = getargs(
-            'name', 'data', 'dtype', 'attributes', 'dimension_labels', 'maxshape', 'chunks', 'parent', 'source',
-            kwargs
-        )
+        name, data, dtype, attributes, spec_shapes, dimension_labels, maxshape, chunks, parent, source = getargs(
+            'name', 'data', 'dtype', 'attributes', 'spec_shapes', 'dimension_labels', 'maxshape', 'chunks',
+            'parent', 'source', kwargs)
         super().__init__(name, attributes, parent, source)
         self['data'] = data
         self['attributes'] = _copy.copy(attributes)
         self.__dimension_labels = dimension_labels
         self.__chunks = chunks
+        self.__spec_shapes = spec_shapes
         self.__maxshape = maxshape
         if isinstance(data, BaseBuilder):
             if dtype is None:
                 dtype = self.OBJECT_REF_TYPE
         self.__dtype = dtype
         self.__name = name

+    @property
+    def spec_shapes(self):
+        """The shapes defined in the spec."""
+        return self.__spec_shapes
+
     @property
     def data(self):
         """The data stored in the dataset represented by this builder."""
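With the new docval entry, the matched spec shape travels with the builder and is exposed read-only through the spec_shapes property. A small usage sketch (hand-constructed builder; in practice the ObjectMapper supplies spec_shapes during build):

from hdmf.build import DatasetBuilder

# The matched spec shape, (None,) here (one unlimited dimension),
# is stored alongside the data and read back via the new property.
builder = DatasetBuilder(name='my_data', data=[1, 2, 3, 4, 5],
                         spec_shapes=(None,))
print(builder.spec_shapes)  # (None,)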
24 changes: 14 additions & 10 deletions src/hdmf/build/objectmapper.py

@@ -722,9 +722,8 @@ def build(self, **kwargs):
             if not isinstance(container, Data):
                 msg = "'container' must be of type Data with DatasetSpec"
                 raise ValueError(msg)
-            breakpoint()
             spec_dtype, spec_shape, spec_dims, spec = self.__check_dset_spec(self.spec, spec_ext)
-            dimension_labels = self.__get_dimension_labels_from_spec(container.data, spec_shape, spec_dims)
+            dimension_labels, matched_shape = self.__get_spec_info(container.data, spec_shape, spec_dims)
             if isinstance(spec_dtype, RefSpec):
                 self.logger.debug("Building %s '%s' as a dataset of references (source: %s)"
                                   % (container.__class__.__name__, container.name, repr(source)))
@@ -735,6 +734,7 @@
                     parent=parent,
                     source=source,
                     dtype=spec_dtype.reftype,
+                    spec_shapes=matched_shape,
                     dimension_labels=dimension_labels,
                 )
                 manager.queue_ref(self.__set_dataset_to_refs(builder, spec_dtype, spec_shape, container, manager))
@@ -749,6 +749,7 @@
                     parent=parent,
                     source=source,
                     dtype=spec_dtype,
+                    spec_shapes=matched_shape,
                     dimension_labels=dimension_labels,
                 )
                 manager.queue_ref(self.__set_compound_dataset_to_refs(builder, spec, spec_dtype, container,
@@ -767,6 +768,7 @@
                     parent=parent,
                     source=source,
                     dtype="object",
+                    spec_shapes=matched_shape,
                     dimension_labels=dimension_labels,
                 )
                 manager.queue_ref(self.__set_untyped_dataset_to_refs(builder, container, manager))
@@ -790,6 +792,7 @@
                 parent=parent,
                 source=source,
                 dtype=dtype,
+                spec_shapes=matched_shape,
                 dimension_labels=dimension_labels,
             )

@@ -821,9 +824,10 @@ def __check_dset_spec(self, orig, ext):
             spec = ext
         return dtype, shape, dims, spec

-    def __get_dimension_labels_from_spec(self, data, spec_shape, spec_dims) -> tuple:
+    def __get_spec_info(self, data, spec_shape, spec_dims):
+        """This will return the dimension labels and shape by matching the data shape to a permissible spec shape."""
         if spec_shape is None or spec_dims is None:
-            return None
+            return None, None
         data_shape = get_data_shape(data)
         # if shape is a list of allowed shapes, find the index of the shape that matches the data
         if isinstance(spec_shape[0], list):
@@ -843,22 +847,22 @@ def __get_dimension_labels_from_spec(self, data, spec_shape, spec_dims) -> tuple
                 # use the most specific match -- the one with the fewest Nones
                 if match_shape_inds:
                     if len(match_shape_inds) == 1:
-                        return tuple(spec_dims[match_shape_inds[0]])
+                        return tuple(spec_dims[match_shape_inds[0]]), tuple(spec_shape[match_shape_inds[0]])
                     else:
                         count_nones = [len([x for x in spec_shape[k] if x is None]) for k in match_shape_inds]
                         index_min_count = count_nones.index(min(count_nones))
                         best_match_ind = match_shape_inds[index_min_count]
-                        return tuple(spec_dims[best_match_ind])
+                        return tuple(spec_dims[best_match_ind]), tuple(spec_shape[best_match_ind])
                 else:
                     # no matches found
                     msg = "Shape of data does not match any allowed shapes in spec '%s'" % self.spec.path
                     warnings.warn(msg, IncorrectDatasetShapeBuildWarning)
-                    return None
+                    return None, None
         else:
             if len(data_shape) != len(spec_shape):
                 msg = "Shape of data does not match shape in spec '%s'" % self.spec.path
                 warnings.warn(msg, IncorrectDatasetShapeBuildWarning)
-                return None
+                return None, None
             # check each dimension. None means any length is allowed
             match = True
             for j, d in enumerate(data_shape):
@@ -868,9 +872,9 @@ def __get_dimension_labels_from_spec(self, data, spec_shape, spec_dims) -> tuple
             if not match:
                 msg = "Shape of data does not match shape in spec '%s'" % self.spec.path
                 warnings.warn(msg, IncorrectDatasetShapeBuildWarning)
-                return None
+                return None, None
         # shape is a single list of allowed dimension lengths
-        return tuple(spec_dims)
+        return tuple(spec_dims), tuple(spec_shape)

     def __is_reftype(self, data):
         if (isinstance(data, AbstractDataChunkIterator) or
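The renamed __get_spec_info keeps the existing matching rule: among spec shapes whose lengths fit the data (with None acting as a wildcard dimension), the most specific one, i.e. the one with the fewest Nones, wins; it now returns the matched shape alongside the dimension labels. A standalone sketch of that selection rule (hypothetical helper, not part of the commit):

def match_spec_shape(data_shape, allowed_shapes):
    """Pick the most specific allowed shape that fits data_shape.

    None in a spec shape matches any length; ties are broken by
    taking the shape with the fewest Nones, as in __get_spec_info.
    Returns None when nothing matches.
    """
    matches = [s for s in allowed_shapes
               if len(s) == len(data_shape)
               and all(d is None or d == n for d, n in zip(s, data_shape))]
    if not matches:
        return None
    # min() keeps the first shape with the fewest wildcard dimensions
    return min(matches, key=lambda s: sum(d is None for d in s))

print(match_spec_shape((5,), [[None], [5]]))          # [5] -- exact beats wildcard
print(match_spec_shape((2, 3), [[None, 3], [2, 4]]))  # [None, 3]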
2 changes: 1 addition & 1 deletion tests/unit/test_io_hdf5.py

@@ -225,5 +225,5 @@ def test_dataset_shape(self):
         io.write_builder(self.builder)
         builder = io.read_builder()
         dset = builder['test_bucket']['foo_holder']['foo1']['my_data'].data
-        self.assertEqual(get_data_shape(dset), (10,))
+        self.assertEqual(get_data_shape(dset), (None,))
         io.close()
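The changed assertion reflects that the dataset is now written with an unlimited first axis: hdmf's get_data_shape reports an object's maxshape when one is available, so the expandable dataset reads back as (None,) rather than its current length. A quick illustration of that preference (assuming get_data_shape's maxshape-first behavior; file and dataset names are illustrative):

import h5py
import numpy as np
from hdmf.utils import get_data_shape

with h5py.File("shapes.h5", "w") as f:
    fixed = f.create_dataset("fixed", data=np.arange(10))
    grow = f.create_dataset("grow", data=np.arange(10), maxshape=(None,))
    print(get_data_shape(fixed))  # (10,)
    print(get_data_shape(grow))   # (None,) -- maxshape takes precedence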
27 changes: 25 additions & 2 deletions tests/unit/test_io_hdf5_h5tools.py

@@ -28,6 +28,7 @@
 from hdmf.testing import TestCase, remove_test_file
 from hdmf.common.resources import HERD
 from hdmf.term_set import TermSet, TermSetWrapper
+from hdmf.utils import get_data_shape


 from tests.unit.helpers.utils import (Foo, FooBucket, FooFile, get_foo_buildmanager,
@@ -739,12 +740,12 @@ def test_copy_h5py_dataset_h5dataio_input(self):
                          self.f['test_copy'][:].tolist())

     def test_list_fill_empty(self):
-        dset = self.io.__list_fill__(self.f, 'empty_dataset', [], options={'dtype': int, 'io_settings': {}})
+        dset = self.io.__list_fill__(self.f, 'empty_dataset', [], None, True, options={'dtype': int, 'io_settings': {}})
         self.assertTupleEqual(dset.shape, (0,))

     def test_list_fill_empty_no_dtype(self):
         with self.assertRaisesRegex(Exception, r"cannot add \S+ to [/\S]+ - could not determine type"):
-            self.io.__list_fill__(self.f, 'empty_dataset', [])
+            self.io.__list_fill__(self.f, 'empty_dataset', [], None, True)

     def test_read_str(self):
         a = ['a', 'bb', 'ccc', 'dddd', 'e']
@@ -3725,3 +3726,25 @@ def test_set_data_io(self):
         self.data.set_data_io(H5DataIO, dict(chunks=True))
         assert isinstance(self.data.data, H5DataIO)
         assert self.data.data.io_settings["chunks"]
+
+
+class TestExpand(TestCase):
+    def setUp(self):
+        self.manager = get_foo_buildmanager()
+        self.path = get_temp_filepath()
+
+    def test_expand_false(self):
+        # Setup all the data we need
+        foo1 = Foo('foo1', [1, 2, 3, 4, 5], "I am foo1", 17, 3.14)
+        foobucket = FooBucket('bucket1', [foo1])
+        foofile = FooFile(buckets=[foobucket])
+
+        with HDF5IO(self.path, manager=self.manager, mode='w') as io:
+            io.write(foofile, expandable=False)
+
+        io = HDF5IO(self.path, manager=self.manager, mode='r')
+        read_foofile = io.read()
+        self.assertListEqual(foofile.buckets['bucket1'].foos['foo1'].my_data,
+                             read_foofile.buckets['bucket1'].foos['foo1'].my_data[:].tolist())
+        self.assertEqual(get_data_shape(read_foofile.buckets['bucket1'].foos['foo1'].my_data),
+                         (5,))
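The commit only pins down the expandable=False path. A natural counterpart method for TestExpand (hypothetical, not in this commit, and assuming expandable defaults to True on write) would assert that the default write yields an unlimited axis:

    def test_expand_true(self):
        foo1 = Foo('foo1', [1, 2, 3, 4, 5], "I am foo1", 17, 3.14)
        foofile = FooFile(buckets=[FooBucket('bucket1', [foo1])])

        with HDF5IO(self.path, manager=self.manager, mode='w') as io:
            io.write(foofile)  # expandable assumed to default to True

        with HDF5IO(self.path, manager=self.manager, mode='r') as io:
            read_foofile = io.read()
            # an expandable dataset reports its maxshape, i.e. (None,)
            self.assertEqual(
                get_data_shape(read_foofile.buckets['bucket1'].foos['foo1'].my_data),
                (None,))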
