VectorData Expand by Default via write_dataset (#1093)
mavaylon1 authored May 7, 2024
1 parent 126bdb1 commit 201b8c4
Showing 4 changed files with 59 additions and 13 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -7,6 +7,7 @@
- Added `TypeConfigurator` to automatically wrap fields with `TermSetWrapper` according to a configuration file. @mavaylon1 [#1016](https://github.com/hdmf-dev/hdmf/pull/1016)
- Updated `TermSetWrapper` to support validating a single field within a compound array. @mavaylon1 [#1061](https://github.com/hdmf-dev/hdmf/pull/1061)
- Updated testing to not install in editable mode and not run `coverage` by default. @rly [#1107](https://github.com/hdmf-dev/hdmf/pull/1107)
- Updated the default behavior for writing HDF5 datasets to create expandable datasets with chunking enabled by default. This does not override user-set chunking parameters. @mavaylon1 [#1093](https://github.com/hdmf-dev/hdmf/pull/1093)

## HDMF 3.13.0 (March 20, 2024)

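In raw h5py terms, the new default amounts to the following (a minimal sketch, not part of this commit; file and dataset names are illustrative):

import h5py
import numpy as np

with h5py.File("example.h5", "w") as f:
    # Expandable with auto-chunking -- this mirrors the new hdmf default
    dset = f.create_dataset("data", data=np.arange(5), maxshape=(None,), chunks=True)
    dset.resize((10,))  # allowed: every dimension is unbounded
    dset[5:] = np.arange(5, 10)

    # Fixed shape -- this mirrors passing expandable=False
    fixed = f.create_dataset("fixed", data=np.arange(5))
    # fixed.resize((10,)) would raise a TypeError, since the dataset is contiguous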
34 changes: 27 additions & 7 deletions src/hdmf/backends/hdf5/h5tools.py
@@ -364,7 +364,10 @@ def copy_file(self, **kwargs):
'default': True},
{'name': 'herd', 'type': 'hdmf.common.resources.HERD',
'doc': 'A HERD object to populate with references.',
'default': None})
'default': None},
{'name': 'expandable', 'type': bool, 'default': True,
 'doc': ('Bool to set whether datasets are expandable by setting the max shape for all dimensions '
         'of a dataset to None and enabling auto-chunking by default.')})
def write(self, **kwargs):
"""Write the container to an HDF5 file."""
if self.__mode == 'r':
@@ -804,10 +807,16 @@ def close_linked_files(self):
'doc': 'exhaust DataChunkIterators one at a time. If False, exhaust them concurrently',
'default': True},
{'name': 'export_source', 'type': str,
'doc': 'The source of the builders when exporting', 'default': None})
'doc': 'The source of the builders when exporting', 'default': None},
{'name': 'expandable', 'type': bool, 'default': True,
 'doc': ('Bool to set whether datasets are expandable by setting the max shape for all dimensions '
         'of a dataset to None and enabling auto-chunking by default.')})
def write_builder(self, **kwargs):
f_builder = popargs('builder', kwargs)
link_data, exhaust_dci, export_source = getargs('link_data', 'exhaust_dci', 'export_source', kwargs)
link_data, exhaust_dci, export_source = getargs('link_data',
'exhaust_dci',
'export_source',
kwargs)
self.logger.debug("Writing GroupBuilder '%s' to path '%s' with kwargs=%s"
% (f_builder.name, self.source, kwargs))
for name, gbldr in f_builder.groups.items():
@@ -978,6 +987,9 @@ def _filler():
'default': True},
{'name': 'export_source', 'type': str,
'doc': 'The source of the builders when exporting', 'default': None},
{'name': 'expandable', 'type': bool, 'default': True,
 'doc': ('Bool to set whether datasets are expandable by setting the max shape for all dimensions '
         'of a dataset to None and enabling auto-chunking by default.')},
returns='the Group that was created', rtype=Group)
def write_group(self, **kwargs):
parent, builder = popargs('parent', 'builder', kwargs)
@@ -1078,14 +1090,17 @@ def write_link(self, **kwargs):
'default': True},
{'name': 'export_source', 'type': str,
'doc': 'The source of the builders when exporting', 'default': None},
{'name': 'expandable', 'type': bool, 'default': True,
 'doc': ('Bool to set whether datasets are expandable by setting the max shape for all dimensions '
         'of a dataset to None and enabling auto-chunking by default.')},
returns='the Dataset that was created', rtype=Dataset)
def write_dataset(self, **kwargs): # noqa: C901
""" Write a dataset to HDF5
The function uses other dataset-dependent write functions, e.g.,
``__scalar_fill__``, ``__list_fill__``, and ``__setup_chunked_dset__`` to write the data.
"""
parent, builder = popargs('parent', 'builder', kwargs)
parent, builder, expandable = popargs('parent', 'builder', 'expandable', kwargs)
link_data, exhaust_dci, export_source = getargs('link_data', 'exhaust_dci', 'export_source', kwargs)
self.logger.debug("Writing DatasetBuilder '%s' to parent group '%s'" % (builder.name, parent.name))
if self.get_written(builder):
@@ -1102,6 +1117,7 @@ def write_dataset(self, **kwargs): # noqa: C901
data = data.data
else:
options['io_settings'] = {}

attributes = builder.attributes
options['dtype'] = builder.dtype
dset = None
@@ -1206,7 +1222,7 @@ def _filler():
return
# If the compound data type contains only regular data (i.e., no references) then we can write it as usual
else:
dset = self.__list_fill__(parent, name, data, options)
dset = self.__list_fill__(parent, name, data, expandable, options)
# Write a dataset containing references, i.e., a region or object reference.
# NOTE: we can ignore options['io_settings'] for scalar data
elif self.__is_ref(options['dtype']):
@@ -1301,7 +1317,7 @@ def _filler():
self.__dci_queue.append(dataset=dset, data=data)
# Write a regular in memory array (e.g., numpy array, list etc.)
elif hasattr(data, '__len__'):
dset = self.__list_fill__(parent, name, data, options)
dset = self.__list_fill__(parent, name, data, expandable, options)
# Write a regular scalar dataset
else:
dset = self.__scalar_fill__(parent, name, data, options)
@@ -1429,7 +1445,7 @@ def __chunked_iter_fill__(cls, parent, name, data, options=None):
return dset

@classmethod
def __list_fill__(cls, parent, name, data, options=None):
def __list_fill__(cls, parent, name, data, expandable, options=None):
# define the io settings and data type if necessary
io_settings = {}
dtype = None
@@ -1451,6 +1467,10 @@ def __list_fill__(cls, parent, name, data, options=None):
data_shape = (len(data),)
else:
data_shape = get_data_shape(data)
if expandable:
# Don't override existing settings
if 'maxshape' not in io_settings:
io_settings['maxshape'] = tuple([None]*len(data_shape))

# Create the dataset
try:
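Condensed, the maxshape handling added to `__list_fill__` above reduces to the following sketch (`resolve_maxshape` is a hypothetical helper name, not part of this commit):

from hdmf.utils import get_data_shape

def resolve_maxshape(data, expandable, io_settings):
    # Mirror of the diff's logic: make every dimension unbounded so h5py
    # auto-chunks the dataset, but never override user-set io_settings.
    data_shape = get_data_shape(data)
    if expandable and 'maxshape' not in io_settings:
        io_settings['maxshape'] = tuple([None] * len(data_shape))
    return io_settings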
2 changes: 1 addition & 1 deletion tests/unit/test_io_hdf5.py
@@ -225,5 +225,5 @@ def test_dataset_shape(self):
io.write_builder(self.builder)
builder = io.read_builder()
dset = builder['test_bucket']['foo_holder']['foo1']['my_data'].data
self.assertEqual(get_data_shape(dset), (10,))
self.assertEqual(get_data_shape(dset), (None,))
io.close()
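The expected shape changes from (10,) to (None,) because `get_data_shape` reports a dataset's maxshape when one is available (as the updated assertion implies), and the new default writes every dimension with maxshape None. A standalone check, assuming only h5py and hdmf.utils:

import h5py
import numpy as np
from hdmf.utils import get_data_shape

with h5py.File("shape_demo.h5", "w") as f:
    d = f.create_dataset("d", data=np.arange(10), maxshape=(None,), chunks=True)
    print(get_data_shape(d))  # (None,)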
35 changes: 30 additions & 5 deletions tests/unit/test_io_hdf5_h5tools.py
@@ -28,6 +28,7 @@
from hdmf.testing import TestCase, remove_test_file
from hdmf.common.resources import HERD
from hdmf.term_set import TermSet, TermSetWrapper
from hdmf.utils import get_data_shape


from tests.unit.helpers.utils import (Foo, FooBucket, FooFile, get_foo_buildmanager,
@@ -163,6 +164,7 @@ def test_write_dataset_list(self):
self.io.write_dataset(self.f, DatasetBuilder('test_dataset', a.tolist(), attributes={}))
dset = self.f['test_dataset']
self.assertTrue(np.all(dset[:] == a))
self.assertEqual(get_data_shape(dset), (None, None, None))

def test_write_dataset_list_compress_gzip(self):
a = H5DataIO(np.arange(30).reshape(5, 2, 3),
@@ -266,9 +268,9 @@ def test_write_dataset_list_fillvalue(self):
self.assertEqual(dset.fillvalue, -1)

##########################################
# write_dataset tests: tables
# write_dataset tests: cmpd_dt
##########################################
def test_write_table(self):
def test_write_cmpd_dt(self):
cmpd_dt = np.dtype([('a', np.int32), ('b', np.float64)])
data = np.zeros(10, dtype=cmpd_dt)
data['a'][1] = 101
@@ -279,8 +281,9 @@ def test_write_table_nested(self):
dset = self.f['test_dataset']
self.assertEqual(dset['a'].tolist(), data['a'].tolist())
self.assertEqual(dset['b'].tolist(), data['b'].tolist())
self.assertEqual(get_data_shape(dset), (None,))

def test_write_table_nested(self):
def test_write_cmpd_dt_nested(self):
b_cmpd_dt = np.dtype([('c', np.int32), ('d', np.float64)])
cmpd_dt = np.dtype([('a', np.int32), ('b', b_cmpd_dt)])
data = np.zeros(10, dtype=cmpd_dt)
@@ -739,12 +742,12 @@ def test_copy_h5py_dataset_h5dataio_input(self):
self.f['test_copy'][:].tolist())

def test_list_fill_empty(self):
dset = self.io.__list_fill__(self.f, 'empty_dataset', [], options={'dtype': int, 'io_settings': {}})
dset = self.io.__list_fill__(self.f, 'empty_dataset', [], True, options={'dtype': int, 'io_settings': {}})
self.assertTupleEqual(dset.shape, (0,))

def test_list_fill_empty_no_dtype(self):
with self.assertRaisesRegex(Exception, r"cannot add \S+ to [/\S]+ - could not determine type"):
self.io.__list_fill__(self.f, 'empty_dataset', [])
self.io.__list_fill__(self.f, 'empty_dataset', [], True)

def test_read_str(self):
a = ['a', 'bb', 'ccc', 'dddd', 'e']
@@ -763,6 +766,28 @@ def test_read_str(self):
'<HDF5 dataset "test_dataset": shape (5,), type "|O">')


class TestExpand(TestCase):
def setUp(self):
self.manager = get_foo_buildmanager()
self.path = get_temp_filepath()

def test_expand_false(self):
# Setup all the data we need
foo1 = Foo('foo1', [1, 2, 3, 4, 5], "I am foo1", 17, 3.14)
foobucket = FooBucket('bucket1', [foo1])
foofile = FooFile(buckets=[foobucket])

with HDF5IO(self.path, manager=self.manager, mode='w') as io:
io.write(foofile, expandable=False)

io = HDF5IO(self.path, manager=self.manager, mode='r')
read_foofile = io.read()
self.assertListEqual(foofile.buckets['bucket1'].foos['foo1'].my_data,
read_foofile.buckets['bucket1'].foos['foo1'].my_data[:].tolist())
self.assertEqual(get_data_shape(read_foofile.buckets['bucket1'].foos['foo1'].my_data),
(5,))

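A companion test for the default path might look like the following (a hypothetical sketch, not part of this commit, reusing the helpers above within TestExpand):

    def test_expand_true_default(self):
        foo1 = Foo('foo1', [1, 2, 3, 4, 5], "I am foo1", 17, 3.14)
        foobucket = FooBucket('bucket1', [foo1])
        foofile = FooFile(buckets=[foobucket])

        with HDF5IO(self.path, manager=self.manager, mode='w') as io:
            io.write(foofile)  # expandable defaults to True

        with HDF5IO(self.path, manager=self.manager, mode='r') as io:
            read_foofile = io.read()
            # the backing dataset should now report an unbounded max shape
            self.assertEqual(get_data_shape(read_foofile.buckets['bucket1'].foos['foo1'].my_data),
                             (None,))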

class TestRoundTrip(TestCase):

def setUp(self):
