Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Check DynamicTableRegion data is in bounds #1168

Draft
wants to merge 1 commit into
base: dev
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@

### Enhancements
- Added support to append to a dataset of references for HDMF-Zarr. @mavaylon1 [#1157](https://github.com/hdmf-dev/hdmf/pull/1157)

- Added a check when setting or adding data to a `DynamicTableRegion` or setting the `table` attribute of a `DynamicTableRegion`
that the data values are in bounds of the linked table. This can be turned off for
`DynamicTableRegion.__init__` using the keyword argument `validate_data=False`. @rly [#1168](https://github.com/hdmf-dev/hdmf/pull/1168)
## HDMF 3.14.3 (July 29, 2024)

### Enhancements
Expand Down
48 changes: 38 additions & 10 deletions src/hdmf/common/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -1360,42 +1360,64 @@ class DynamicTableRegion(VectorData):
'table',
)

MAX_ROWS_TO_VALIDATE_INIT = int(1e3)

@docval({'name': 'name', 'type': str, 'doc': 'the name of this VectorData'},
{'name': 'data', 'type': ('array_data', 'data'),
'doc': 'a dataset where the first dimension is a concatenation of multiple vectors'},
{'name': 'description', 'type': str, 'doc': 'a description of what this region represents'},
{'name': 'table', 'type': DynamicTable,
'doc': 'the DynamicTable this region applies to', 'default': None},
{'name': 'validate_data', 'type': bool,
'doc': 'whether to validate the data is in bounds of the linked table', 'default': True},
allow_positional=AllowPositional.WARNING)
def __init__(self, **kwargs):
t = popargs('table', kwargs)
t, validate_data = popargs('table', 'validate_data', kwargs)
data = getargs('data', kwargs)
if validate_data:
self._validate_index_in_range(data, t)

super().__init__(**kwargs)
self.table = t
if t is not None: # set the table attribute using fields to avoid another validation in the setter
self.fields['table'] = t

def _validate_index_in_range(self, data, table):
"""If the length of data is small, and if data contains an index that is out of bounds, then raise an error.
If the object is being constructed from a file, raise a warning instead to ensure invalid data can still be
read.
"""
if table and len(data) <= self.MAX_ROWS_TO_VALIDATE_INIT:
for val in data:
if val >= len(table) or val < 0:
error_msg = f"DynamicTableRegion index {val} is out of bounds for {type(table)} '{table.name}'."
self._error_on_new_warn_on_construct(error_msg, error_cls=IndexError)
Comment on lines +1390 to +1393
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of iterating through all the values, I think checking for np.max(data) >= len(table) would be more efficient.


@property
def table(self):
"""The DynamicTable this DynamicTableRegion is pointing to"""
return self.fields.get('table')

@table.setter
def table(self, val):
def table(self, table):
"""
Set the table this DynamicTableRegion should be pointing to
Set the table this DynamicTableRegion should be pointing to.

:param val: The DynamicTable this DynamicTableRegion should be pointing to
This will validate all data elements in this DynamicTableRegion to ensure they are within bounds.

:param table: The DynamicTable this DynamicTableRegion should be pointing to

:raises: AttributeError if table is already in fields
:raises: IndexError if the current indices are out of bounds for the new table given by val
"""
if val is None:
if table is None:
return
if 'table' in self.fields:
msg = "can't set attribute 'table' -- already set"
raise AttributeError(msg)
dat = self.data
if isinstance(dat, DataIO):
dat = dat.data
self.fields['table'] = val

self.fields['table'] = table
for val in self.data:
self._validate_new_data_element(val)
Comment on lines +1419 to +1420
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You might want to try this as a list comprehension [self._validate_new_data_element(val) for val in self.data] just because they tend to be faster in Python than regular for loops


def __getitem__(self, arg):
return self.get(arg)
Expand Down Expand Up @@ -1540,6 +1562,12 @@ def _validate_on_set_parent(self):
warn(msg, stacklevel=2)
return super()._validate_on_set_parent()

def _validate_new_data_element(self, arg):
"""Validate that the new index is within bounds of the table. Raises an IndexError if not."""
if self.table and (arg >= len(self.table) or arg < 0):
raise IndexError(f"DynamicTableRegion index {arg} is out of bounds for "
f"{type(self.table)} '{self.table.name}'.")


def _uint_precision(elements):
""" Calculate the uint precision needed to encode a set of elements """
Expand Down
13 changes: 13 additions & 0 deletions src/hdmf/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,19 @@
"""
pass

def _error_on_new_warn_on_construct(self, error_msg: str, error_cls: type = ValueError):
"""Raise a ValueError when a check is violated on instance creation.
To ensure backwards compatibility, this method throws a warning
instead of raising an error when reading from a file, ensuring that
files with invalid data can be read. If error_msg is set to None
the function will simply return without further action.
"""
if error_msg is None:
return

Check warning on line 578 in src/hdmf/container.py

View check run for this annotation

Codecov / codecov/patch

src/hdmf/container.py#L578

Added line #L578 was not covered by tests
if not self._in_construct_mode:
raise error_cls(error_msg)
warn(error_msg)

Check warning on line 581 in src/hdmf/container.py

View check run for this annotation

Codecov / codecov/patch

src/hdmf/container.py#L581

Added line #L581 was not covered by tests


class Container(AbstractContainer):
"""A container that can contain other containers and has special functionality for printing."""
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/common/test_linkedtables.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,7 +445,7 @@ def test_to_hierarchical_dataframe_indexed_dtr_on_last_level(self):
p1 = DynamicTable(name='parent_table', description='parent_table',
columns=[VectorData(name='p1', description='p1', data=np.arange(3)), dtr_p1, vi_dtr_p1])
# Super-parent table
dtr_sp = DynamicTableRegion(name='sl1', description='sl1', data=np.arange(4), table=p1)
dtr_sp = DynamicTableRegion(name='sl1', description='sl1', data=[0, 1, 2, 0], table=p1)
vi_dtr_sp = VectorIndex(name='sl1_index', data=[1, 2, 3], target=dtr_sp)
spt = DynamicTable(name='super_parent_table', description='super_parent_table',
columns=[VectorData(name='sp1', description='sp1', data=np.arange(3)), dtr_sp, vi_dtr_sp])
Expand Down
28 changes: 28 additions & 0 deletions tests/unit/common/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -1287,6 +1287,34 @@ def test_no_df_nested(self):
with self.assertRaisesWith(ValueError, msg):
dynamic_table_region.get(0, df=False, index=False)

def test_init_out_of_bounds(self):
table = self.with_columns_and_data()
with self.assertRaises(IndexError):
DynamicTableRegion(name='dtr', data=[0, 1, 2, 2, 5], description='desc', table=table)

def test_init_out_of_bounds_long(self):
table = self.with_columns_and_data()
data = np.ones(DynamicTableRegion.MAX_ROWS_TO_VALIDATE_INIT+1, dtype=int)*5
dtr = DynamicTableRegion(name='dtr', data=data, description='desc', table=table)
assert dtr.data is data # no exception raised

def test_init_out_of_bounds_no_validate(self):
table = self.with_columns_and_data()
dtr = DynamicTableRegion(name='dtr', data=[0, 1, 5], description='desc', table=table, validate_data=False)
self.assertEqual(dtr.data, [0, 1, 5]) # no exception raised

def test_add_row_out_of_bounds(self):
table = self.with_columns_and_data()
dtr = DynamicTableRegion(name='dtr', data=[0, 1, 2, 2], description='desc', table=table)
with self.assertRaises(IndexError):
dtr.add_row(5)

def test_set_table_out_of_bounds(self):
table = self.with_columns_and_data()
dtr = DynamicTableRegion(name='dtr', data=[0, 1, 5], description='desc')
with self.assertRaises(IndexError):
dtr.table = table


class DynamicTableRegionRoundTrip(H5RoundTripMixin, TestCase):

Expand Down
Loading