diff --git a/src/azul/plugins/__init__.py b/src/azul/plugins/__init__.py index 893499fb2..2fab31a4e 100644 --- a/src/azul/plugins/__init__.py +++ b/src/azul/plugins/__init__.py @@ -413,13 +413,12 @@ def special_fields(self) -> SpecialFields: @property @abstractmethod - def implicit_hub_type(self) -> str: + def hot_entity_types(self) -> AbstractSet[str]: """ - The type of entities that do not explicitly track their hubs in replica + The types of entities that do not explicitly track their hubs in replica documents in order to avoid a large list of hub references in the replica document, and to avoid contention when updating that list during - indexing. Note that this is not a type of hub entities, but rather the - type of replica entities that have implicit hubs. + indexing. """ raise NotImplementedError diff --git a/src/azul/plugins/metadata/anvil/__init__.py b/src/azul/plugins/metadata/anvil/__init__.py index b3fe44532..3290dd57a 100644 --- a/src/azul/plugins/metadata/anvil/__init__.py +++ b/src/azul/plugins/metadata/anvil/__init__.py @@ -5,6 +5,7 @@ itemgetter, ) from typing import ( + AbstractSet, Iterable, Sequence, ) @@ -249,8 +250,8 @@ def special_fields(self) -> SpecialFields: bundle_version='bundle_version') @property - def implicit_hub_type(self) -> str: - return 'datasets' + def hot_entity_types(self) -> AbstractSet[str]: + return {'datasets'} @property def facets(self) -> Sequence[str]: diff --git a/src/azul/plugins/metadata/hca/__init__.py b/src/azul/plugins/metadata/hca/__init__.py index 0286a2b48..85f4ce1c7 100644 --- a/src/azul/plugins/metadata/hca/__init__.py +++ b/src/azul/plugins/metadata/hca/__init__.py @@ -1,4 +1,5 @@ from typing import ( + AbstractSet, Iterable, Sequence, TYPE_CHECKING, @@ -285,8 +286,8 @@ def special_fields(self) -> SpecialFields: bundle_version='bundleVersion') @property - def implicit_hub_type(self) -> str: - return 'projects' + def hot_entity_types(self) -> AbstractSet[str]: + return {'projects'} @property def facets(self) -> Sequence[str]: diff --git a/src/azul/service/manifest_service.py b/src/azul/service/manifest_service.py index dc3ad7ba0..5739c0381 100644 --- a/src/azul/service/manifest_service.py +++ b/src/azul/service/manifest_service.py @@ -48,6 +48,7 @@ ) import time from typing import ( + AbstractSet, IO, Protocol, Self, @@ -1992,12 +1993,15 @@ def included_fields(self) -> list[FieldPath]: # "keys" used for the join. return [ ('entity_id',), - ('contents', self.implicit_hub_type, 'document_id') + *( + ('contents', entity_type, 'document_id') + for entity_type in self.hot_entity_types + ) ] @property - def implicit_hub_type(self) -> str: - return self.service.metadata_plugin(self.catalog).implicit_hub_type + def hot_entity_types(self) -> AbstractSet[str]: + return self.service.metadata_plugin(self.catalog).hot_entity_types @attrs.frozen(kw_only=True) class ReplicaKeys: @@ -2011,14 +2015,17 @@ class ReplicaKeys: or the replica's entity ID. """ hub_id: str - replica_id: str + replica_ids: list[str] def _replica_keys(self) -> Iterable[ReplicaKeys]: - hub_type = self.implicit_hub_type request = self._create_request() for hit in request.scan(): yield self.ReplicaKeys(hub_id=hit['entity_id'], - replica_id=one(one(hit['contents'][hub_type])['document_id'])) + replica_ids=[ + document_id + for entity_type in self.hot_entity_types + for document_id in hit['contents'][entity_type]['document_id'] + ]) def _all_replicas(self) -> Iterable[JSON]: emitted_replica_ids = set() @@ -2053,7 +2060,7 @@ def _join_replicas(self, keys: Iterable[ReplicaKeys]) -> Iterable[Hit]: hub_ids, replica_ids = set(), set() for key in keys: hub_ids.add(key.hub_id) - replica_ids.add(key.replica_id) + replica_ids.update(key.replica_ids) request = request.query(Q('bool', should=[ {'terms': {'hub_ids.keyword': list(hub_ids)}}, {'terms': {'entity_id.keyword': list(replica_ids)}}