Skip to content

Commit

Permalink
Merge branch 'dev' into zarr_append
Browse files Browse the repository at this point in the history
  • Loading branch information
mavaylon1 authored Jul 28, 2024
2 parents 27bb840 + 4c32820 commit dcea8a0
Show file tree
Hide file tree
Showing 6 changed files with 424 additions and 10 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
# HDMF Changelog

## HDMF 3.14.3 (Upcoming)

### Enhancements
- Added new attribute "dimension_labels" on `DatasetBuilder` which specifies the names of the dimensions used in the
dataset based on the shape of the dataset data and the dimension names in the spec for the data type. This attribute
is available on build (during the write process), but not on read of a dataset from a file. @rly [#1081](https://github.com/hdmf-dev/hdmf/pull/1081)

## HDMF 3.14.2 (July 7, 2024)

### Enhancements
Expand Down
21 changes: 21 additions & 0 deletions src/hdmf/backends/hdf5/h5_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,10 +554,31 @@ def __init__(self, **kwargs):

@property
def dataset(self):
    """The cached h5py.Dataset, or None if no dataset has been cached yet.

    Populated by the ``dataset`` setter (typically during write); see the
    setter for the intended fill-in-later usage pattern.
    """
    return self.__dataset

@dataset.setter
def dataset(self, val):
    """Cache the h5py.Dataset written with the stored IO settings.

    This attribute can be used to cache a written, empty dataset and fill it in later.
    This allows users to access the handle to the dataset *without* having to close
    and reopen a file.

    For example::

        dataio = H5DataIO(shape=(5,), dtype=int)
        foo = Foo('foo1', dataio, "I am foo1", 17, 3.14)
        bucket = FooBucket('bucket1', [foo])
        foofile = FooFile(buckets=[bucket])
        io = HDF5IO(self.path, manager=self.manager, mode='w')
        # write the object to disk, including initializing an empty int dataset with shape (5,)
        io.write(foofile)
        foo.my_data.dataset[:] = [0, 1, 2, 3, 4]
        io.close()

    :param val: the h5py.Dataset to cache
    :raises ValueError: if a dataset has already been cached (write-once)
    """
    # Write-once guard: silently replacing an already-cached dataset would
    # orphan the handle that callers may still be holding.
    if self.__dataset is not None:
        raise ValueError("Cannot overwrite H5DataIO.dataset")
    self.__dataset = val
Expand Down
16 changes: 14 additions & 2 deletions src/hdmf/build/builders.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,18 +330,25 @@ class DatasetBuilder(BaseBuilder):
'doc': 'The datatype of this dataset.', 'default': None},
{'name': 'attributes', 'type': dict,
'doc': 'A dictionary of attributes to create in this dataset.', 'default': dict()},
{'name': 'dimension_labels', 'type': tuple,
'doc': ('A list of labels for each dimension of this dataset from the spec. Currently this is '
'supplied only on build.'),
'default': None},
{'name': 'maxshape', 'type': (int, tuple),
'doc': 'The shape of this dataset. Use None for scalars.', 'default': None},
{'name': 'chunks', 'type': bool, 'doc': 'Whether or not to chunk this dataset.', 'default': False},
{'name': 'parent', 'type': GroupBuilder, 'doc': 'The parent builder of this builder.', 'default': None},
{'name': 'source', 'type': str, 'doc': 'The source of the data in this builder.', 'default': None})
def __init__(self, **kwargs):
""" Create a Builder object for a dataset """
name, data, dtype, attributes, maxshape, chunks, parent, source = getargs(
'name', 'data', 'dtype', 'attributes', 'maxshape', 'chunks', 'parent', 'source', kwargs)
name, data, dtype, attributes, dimension_labels, maxshape, chunks, parent, source = getargs(
'name', 'data', 'dtype', 'attributes', 'dimension_labels', 'maxshape', 'chunks', 'parent', 'source',
kwargs
)
super().__init__(name, attributes, parent, source)
self['data'] = data
self['attributes'] = _copy.copy(attributes)
self.__dimension_labels = dimension_labels
self.__chunks = chunks
self.__maxshape = maxshape
if isinstance(data, BaseBuilder):
Expand All @@ -361,6 +368,11 @@ def data(self, val):
raise AttributeError("Cannot overwrite data.")
self['data'] = val

@property
def dimension_labels(self):
    """Labels for each dimension of this dataset from the spec.

    Set from the ``dimension_labels`` constructor argument. Per the changelog,
    this is supplied only on build (write); on read from a file it is None.
    """
    return self.__dimension_labels

@property
def chunks(self):
"""Whether or not this dataset is chunked."""
Expand Down
97 changes: 90 additions & 7 deletions src/hdmf/build/objectmapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from .errors import (BuildError, OrphanContainerBuildError, ReferenceTargetNotBuiltError, ContainerConfigurationError,
ConstructError)
from .manager import Proxy, BuildManager

from .warnings import MissingRequiredBuildWarning, DtypeConversionWarning, IncorrectQuantityBuildWarning
from hdmf.backends.hdf5.h5_utils import H5DataIO

Expand All @@ -19,7 +20,7 @@
from ..query import ReferenceResolver
from ..spec import Spec, AttributeSpec, DatasetSpec, GroupSpec, LinkSpec, RefSpec
from ..spec.spec import BaseStorageSpec
from ..utils import docval, getargs, ExtenderMeta, get_docval
from ..utils import docval, getargs, ExtenderMeta, get_docval, get_data_shape

_const_arg = '__constructor_arg'

Expand Down Expand Up @@ -723,19 +724,34 @@ def build(self, **kwargs):
if not isinstance(container, Data):
msg = "'container' must be of type Data with DatasetSpec"
raise ValueError(msg)
spec_dtype, spec_shape, spec = self.__check_dset_spec(self.spec, spec_ext)
spec_dtype, spec_shape, spec_dims, spec = self.__check_dset_spec(self.spec, spec_ext)
dimension_labels = self.__get_dimension_labels_from_spec(container.data, spec_shape, spec_dims)
if isinstance(spec_dtype, RefSpec):
self.logger.debug("Building %s '%s' as a dataset of references (source: %s)"
% (container.__class__.__name__, container.name, repr(source)))
# create dataset builder with data=None as a placeholder. fill in with refs later
builder = DatasetBuilder(name, data=None, parent=parent, source=source, dtype=spec_dtype.reftype)
builder = DatasetBuilder(
name,
data=None,
parent=parent,
source=source,
dtype=spec_dtype.reftype,
dimension_labels=dimension_labels,
)
manager.queue_ref(self.__set_dataset_to_refs(builder, spec_dtype, spec_shape, container, manager))
elif isinstance(spec_dtype, list):
# a compound dataset
self.logger.debug("Building %s '%s' as a dataset of compound dtypes (source: %s)"
% (container.__class__.__name__, container.name, repr(source)))
# create dataset builder with data=None, dtype=None as a placeholder. fill in with refs later
builder = DatasetBuilder(name, data=None, parent=parent, source=source, dtype=spec_dtype)
builder = DatasetBuilder(
name,
data=None,
parent=parent,
source=source,
dtype=spec_dtype,
dimension_labels=dimension_labels,
)
manager.queue_ref(self.__set_compound_dataset_to_refs(builder, spec, spec_dtype, container,
manager))
else:
Expand All @@ -746,7 +762,14 @@ def build(self, **kwargs):
% (container.__class__.__name__, container.name, repr(source)))
# an unspecified dtype and we were given references
# create dataset builder with data=None as a placeholder. fill in with refs later
builder = DatasetBuilder(name, data=None, parent=parent, source=source, dtype='object')
builder = DatasetBuilder(
name,
data=None,
parent=parent,
source=source,
dtype="object",
dimension_labels=dimension_labels,
)
manager.queue_ref(self.__set_untyped_dataset_to_refs(builder, container, manager))
else:
# a dataset that has no references, pass the conversion off to the convert_dtype method
Expand All @@ -762,7 +785,14 @@ def build(self, **kwargs):
except Exception as ex:
msg = 'could not resolve dtype for %s \'%s\'' % (type(container).__name__, container.name)
raise Exception(msg) from ex
builder = DatasetBuilder(name, bldr_data, parent=parent, source=source, dtype=dtype)
builder = DatasetBuilder(
name,
data=bldr_data,
parent=parent,
source=source,
dtype=dtype,
dimension_labels=dimension_labels,
)

# Add attributes from the specification extension to the list of attributes
all_attrs = self.__spec.attributes + getattr(spec_ext, 'attributes', tuple())
Expand All @@ -781,14 +811,67 @@ def __check_dset_spec(self, orig, ext):
"""
dtype = orig.dtype
shape = orig.shape
dims = orig.dims
spec = orig
if ext is not None:
if ext.dtype is not None:
dtype = ext.dtype
if ext.shape is not None:
shape = ext.shape
dims = ext.dims
spec = ext
return dtype, shape, spec
return dtype, shape, dims, spec

def __get_dimension_labels_from_spec(self, data, spec_shape, spec_dims) -> "tuple | None":
    """Determine the dimension labels for ``data`` from the spec shape(s)/dims.

    :param data: the dataset data being built (used only for its shape)
    :param spec_shape: allowed shape(s) from the spec -- either a single list of
        dimension lengths (``None`` = any length) or a list of such lists
    :param spec_dims: dimension names from the spec, parallel to ``spec_shape``
    :return: a tuple of dimension labels for the shape of ``data``, or ``None``
        if the spec has no shape/dims or if the data shape matches no allowed
        shape (in which case an ``IncorrectDatasetShapeBuildWarning`` is issued)
    """
    if spec_shape is None or spec_dims is None:
        return None
    data_shape = get_data_shape(data)

    def _shape_matches(allowed, actual):
        # a spec dimension of None means any length is allowed
        return len(allowed) == len(actual) and all(
            a is None or a == d for a, d in zip(allowed, actual)
        )

    # if shape is a list of allowed shapes, find the indices of the shapes that match the data
    if isinstance(spec_shape[0], list):
        match_shape_inds = [i for i, s in enumerate(spec_shape) if _shape_matches(s, data_shape)]
        if not match_shape_inds:
            # no matches found
            msg = "Shape of data does not match any allowed shapes in spec '%s'" % self.spec.path
            warnings.warn(msg, IncorrectDatasetShapeBuildWarning)
            return None
        if len(match_shape_inds) == 1:
            return tuple(spec_dims[match_shape_inds[0]])
        # use the most specific match -- the one with the fewest Nones
        # (min returns the first of any ties, matching list.index semantics)
        best_match_ind = min(match_shape_inds, key=lambda k: sum(x is None for x in spec_shape[k]))
        return tuple(spec_dims[best_match_ind])

    # shape is a single list of allowed dimension lengths
    if not _shape_matches(spec_shape, data_shape):
        msg = "Shape of data does not match shape in spec '%s'" % self.spec.path
        warnings.warn(msg, IncorrectDatasetShapeBuildWarning)
        return None
    return tuple(spec_dims)

def __is_reftype(self, data):
if (isinstance(data, AbstractDataChunkIterator) or
Expand Down
7 changes: 7 additions & 0 deletions src/hdmf/build/warnings.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,13 @@ class IncorrectQuantityBuildWarning(BuildWarning):
pass


class IncorrectDatasetShapeBuildWarning(BuildWarning):
    """
    Raised when a dataset is built from data whose shape is not allowed by the spec.
    """
    pass


class MissingRequiredBuildWarning(BuildWarning):
"""
Raised when a required field is missing.
Expand Down
Loading

0 comments on commit dcea8a0

Please sign in to comment.