Skip to content

Commit

Permalink
Clarify and enforce bundle FQID equality semantics (#6671)
Browse files Browse the repository at this point in the history
  • Loading branch information
nadove-ucsc committed Oct 31, 2024
1 parent 24aaa42 commit ca8cd51
Show file tree
Hide file tree
Showing 6 changed files with 105 additions and 41 deletions.
115 changes: 98 additions & 17 deletions src/azul/indexer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
Self,
TypeVar,
TypedDict,
final,
)

import attrs
Expand Down Expand Up @@ -45,21 +46,105 @@
BundleVersion = str


@attrs.frozen(kw_only=True, order=True)
@attrs.frozen(kw_only=True, eq=False)
class BundleFQID(SupportsLessAndGreaterThan):
"""
>>> list(sorted([
... BundleFQID(uuid='d', version='e'),
... BundleFQID(uuid='a', version='c'),
... BundleFQID(uuid='a', version='b'),
... ]))
... # doctest: +NORMALIZE_WHITESPACE
[BundleFQID(uuid='a', version='b'),
BundleFQID(uuid='a', version='c'),
BundleFQID(uuid='d', version='e')]
A fully qualified bundle identifier. The attributes defined in this class
must always be sufficient to decide whether two instances of this class or
its subclasses identify the same bundle or not. Subclasses may define
additional attributes to help describe the bundle, but they are forbidden
from using these attributes in the implementations of their `__eq__` or
`__hash__` methods, either explicitly or in code generated by `attrs`.
"""
uuid: BundleUUID = attrs.field(order=str.lower)
version: BundleVersion = attrs.field(order=str.lower)
uuid: BundleUUID
version: BundleVersion

def _nucleus(self) -> tuple[str, str]:
return self.uuid.lower(), self.version.lower()

# We can't use attrs' generated implementation because it always
# considers operands with different types to be unequal, regardless of
# their inheritance relationships or how their attributes are annotated
# (e.g. specifying `eq=False` has no effect). We want instances of
# all subclasses to compare equal as long as `uuid` and `version` are
# equal.
@final
def __eq__(self, other: 'BundleFQID') -> bool:
"""
>>> b1 = BundleFQID(uuid='a', version='b')
>>> b2 = BundleFQID(uuid='a', version='b')
>>> b1 == b2
True
>>> s1 = SourceRef(id='x', spec=SimpleSourceSpec.parse('y:/0'))
>>> sb1 = SourcedBundleFQID(uuid='a', version='b', source=s1)
>>> sb2 = SourcedBundleFQID(uuid='a', version='b', source=s1)
>>> sb1 == sb2
True
>>> b1 == sb1
True
>>> s2 = SourceRef(id='w', spec=SimpleSourceSpec.parse('z:/0'))
>>> sb3 = SourcedBundleFQID(uuid='a', version='b', source=s2)
>>> b1 == sb3
True
>>> sb1 == sb3
... # doctest: +NORMALIZE_WHITESPACE
Traceback (most recent call last):
...
AssertionError: (('a', 'b'),
SourceRef(id='x', spec=SimpleSourceSpec(prefix=Prefix(common='', partition=0), name='y')),
SourceRef(id='w', spec=SimpleSourceSpec(prefix=Prefix(common='', partition=0), name='z')))
"""
same_bundle = self._nucleus() == other._nucleus()
if (
same_bundle
and isinstance(self, SourcedBundleFQID)
and isinstance(other, SourcedBundleFQID)
):
assert self.source == other.source, (self._nucleus(), self.source, other.source)
return same_bundle

@final
def __hash__(self) -> int:
return hash(self._nucleus())

def __init_subclass__(cls, **kwargs):
"""
>>> @attrs.frozen(kw_only=True)
... class FooBundleFQID(SourcedBundleFQID):
... foo: str
Traceback (most recent call last):
...
AssertionError: <class 'azul.indexer.FooBundleFQID'>
>>> @attrs.frozen(kw_only=True, eq=False)
... class FooBundleFQID(SourcedBundleFQID):
... foo: str
"""
super().__init_subclass__(**kwargs)
assert cls.__eq__ is BundleFQID.__eq__, cls
assert cls.__hash__ is BundleFQID.__hash__, cls

# attrs doesn't allow `order=True` when `eq=False`
def __lt__(self, other: 'BundleFQID') -> bool:
"""
>>> list(sorted([
... BundleFQID(uuid='d', version='e'),
... BundleFQID(uuid='a', version='c'),
... BundleFQID(uuid='a', version='b'),
... ]))
... # doctest: +NORMALIZE_WHITESPACE
[BundleFQID(uuid='a', version='b'),
BundleFQID(uuid='a', version='c'),
BundleFQID(uuid='d', version='e')]
"""
return self._nucleus() < other._nucleus()

def __gt__(self, other: 'BundleFQID'):
return self._nucleus() > other._nucleus()

def to_json(self) -> MutableJSON:
return attrs.asdict(self, recurse=False)
Expand Down Expand Up @@ -462,7 +547,7 @@ class SourcedBundleFQIDJSON(BundleFQIDJSON):
BUNDLE_FQID = TypeVar('BUNDLE_FQID', bound='SourcedBundleFQID')


@attrs.frozen(kw_only=True, order=True)
@attrs.frozen(kw_only=True, eq=False)
class SourcedBundleFQID(BundleFQID, Generic[SOURCE_REF]):
"""
>>> spec = SimpleSourceSpec(name='', prefix=(Prefix(partition=0)))
Expand Down Expand Up @@ -493,10 +578,6 @@ def from_json(cls, json: SourcedBundleFQIDJSON) -> 'SourcedBundleFQID':
source = cls.source_ref_cls().from_json(json.pop('source'))
return cls(source=source, **json)

def upcast(self) -> BundleFQID:
return BundleFQID(uuid=self.uuid,
version=self.version)

def to_json(self) -> SourcedBundleFQIDJSON:
return dict(super().to_json(),
source=self.source.to_json())
Expand Down
17 changes: 0 additions & 17 deletions src/azul/indexer/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -519,23 +519,6 @@ class ContributionCoordinates(DocumentCoordinates[E], Generic[E]):

deleted: bool

def __attrs_post_init__(self):
# If we were to allow instances of subclasses, we'd risk breaking
# equality and hashing semantics. It is impossible to correctly
# implement the transitivity property of equality between instances of
# type and subtype without ignoring the additional attributes added by
# the subtype. Consider types T and S where S is a subtype of T.
# Transitivity requires that s1 == s2 for any two instances s1 and s2
# of S for which s1 == t and s2 == t, where t is any instance of T. The
# subtype instances s1 and s2 can only be equal to t if they match in
# all attributes that T defines. The additional attributes introduced
# by S must be ignored, even when comparing s1 and s2, otherwise s1 and
# s2 might turn out to be unequal. In this particular case that is not
# desirable: we do want to be able to compare instances of subtypes of
# BundleFQID without ignoring any of their attributes.
concrete_type = type(self.bundle)
assert concrete_type is BundleFQID, concrete_type

@property
def document_id(self) -> str:
return '_'.join((
Expand Down
2 changes: 1 addition & 1 deletion src/azul/indexer/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def _contribution(self,
) -> Contribution:
entity = EntityReference(entity_type=self.entity_type(), entity_id=entity_id)
coordinates = ContributionCoordinates(entity=entity,
bundle=self.bundle.fqid.upcast(),
bundle=self.bundle.fqid,
deleted=self.deleted)
return Contribution(coordinates=coordinates,
version=None,
Expand Down
2 changes: 1 addition & 1 deletion src/azul/plugins/repository/tdr_anvil/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ class TDRAnvilBundleFQIDJSON(SourcedBundleFQIDJSON):
table_name: str


@attrs.frozen(kw_only=True)
@attrs.frozen(kw_only=True, eq=False)
class TDRAnvilBundleFQID(TDRBundleFQID):
table_name: BundleType = attrs.field(converter=BundleType)

Expand Down
8 changes: 4 additions & 4 deletions test/indexer/test_indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ def test_deletion(self):
self.assertNotEqual(doc.contents, {})
elif doc_type is DocumentType.contribution:
doc = Contribution.from_index(field_types, hit)
self.assertEqual(bundle_fqid.upcast(), doc.coordinates.bundle)
self.assertEqual(bundle_fqid, doc.coordinates.bundle)
self.assertFalse(doc.coordinates.deleted)
elif doc_type is DocumentType.replica:
pass
Expand All @@ -315,7 +315,7 @@ def test_deletion(self):
if doc_type is DocumentType.contribution:
doc = Contribution.from_index(field_types, hit)
docs_by_entity[doc.entity].append(doc)
self.assertEqual(bundle_fqid.upcast(), doc.coordinates.bundle)
self.assertEqual(bundle_fqid, doc.coordinates.bundle)
else:
# Since there is only one bundle and it was deleted,
# nothing should be aggregated
Expand Down Expand Up @@ -388,7 +388,7 @@ def test_duplicate_notification(self):
coordinates = [
ContributionCoordinates(
entity=entity,
bundle=bundle.fqid.upcast(),
bundle=bundle.fqid,
deleted=False
).with_catalog(self.catalog)
]
Expand Down Expand Up @@ -553,7 +553,7 @@ def test_multi_entity_contributing_bundles(self):
entity_id=old_file_uuid,
entity_type='files')
deletion = ContributionCoordinates(entity=entity,
bundle=bundle_fqid.upcast(),
bundle=bundle_fqid,
deleted=True)
index_name, document_id = deletion.index_name, deletion.document_id
hits = [
Expand Down
2 changes: 1 addition & 1 deletion test/indexer/test_projects.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def test_no_duplicate_files_in_specimen(self):
coordinates = AggregateCoordinates(entity=entity)
else:
coordinates = ContributionCoordinates(entity=entity,
bundle=bundle_fqid.upcast(),
bundle=bundle_fqid,
deleted=False)
result = self.es_client.get(index=coordinates.index_name,
id=coordinates.document_id)
Expand Down

0 comments on commit ca8cd51

Please sign in to comment.