Skip to content

Commit

Permalink
Rename implicit hubs to hot entity types
Browse files Browse the repository at this point in the history
  • Loading branch information
nadove-ucsc committed Nov 8, 2024
1 parent c47df02 commit efb58a0
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 15 deletions.
7 changes: 3 additions & 4 deletions src/azul/plugins/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,13 +413,12 @@ def special_fields(self) -> SpecialFields:

@property
@abstractmethod
def hot_entity_types(self) -> AbstractSet[str]:
    """
    The types of entities that do not explicitly track their hubs in replica
    documents in order to avoid a large list of hub references in the
    replica document, and to avoid contention when updating that list during
    indexing. Note that these are not types of hub entities, but rather the
    types of replica entities whose hubs are implicit.

    Concrete plugins must override this property; the base implementation
    always raises NotImplementedError.
    """
    raise NotImplementedError

Expand Down
5 changes: 3 additions & 2 deletions src/azul/plugins/metadata/anvil/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
itemgetter,
)
from typing import (
AbstractSet,
Iterable,
Sequence,
)
Expand Down Expand Up @@ -249,8 +250,8 @@ def special_fields(self) -> SpecialFields:
bundle_version='bundle_version')

@property
def hot_entity_types(self) -> AbstractSet[str]:
    """
    The entity types whose replicas do not explicitly track their hubs. For
    this plugin that is only `datasets`.
    """
    return {'datasets'}

@property
def facets(self) -> Sequence[str]:
Expand Down
5 changes: 3 additions & 2 deletions src/azul/plugins/metadata/hca/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import (
AbstractSet,
Iterable,
Sequence,
TYPE_CHECKING,
Expand Down Expand Up @@ -285,8 +286,8 @@ def special_fields(self) -> SpecialFields:
bundle_version='bundleVersion')

@property
def hot_entity_types(self) -> AbstractSet[str]:
    """
    The entity types whose replicas do not explicitly track their hubs. For
    this plugin that is only `projects`.
    """
    return {'projects'}

@property
def facets(self) -> Sequence[str]:
Expand Down
21 changes: 14 additions & 7 deletions src/azul/service/manifest_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
)
import time
from typing import (
AbstractSet,
IO,
Protocol,
Self,
Expand Down Expand Up @@ -1992,12 +1993,15 @@ def included_fields(self) -> list[FieldPath]:
# "keys" used for the join.
return [
('entity_id',),
('contents', self.implicit_hub_type, 'document_id')
*(
('contents', entity_type, 'document_id')
for entity_type in self.hot_entity_types
)
]

@property
def hot_entity_types(self) -> AbstractSet[str]:
    """
    The hot entity types declared by the metadata plugin of the current
    catalog. This simply delegates to the plugin's property of the same
    name.
    """
    plugin = self.service.metadata_plugin(self.catalog)
    return plugin.hot_entity_types

@attrs.frozen(kw_only=True)
class ReplicaKeys:
Expand All @@ -2011,14 +2015,17 @@ class ReplicaKeys:
or the replica's entity ID.
"""
hub_id: str
replica_id: str
replica_ids: list[str]

def _replica_keys(self) -> Iterable[ReplicaKeys]:
    """
    Yield one `ReplicaKeys` instance per hit of the scan over the request
    returned by `_create_request`. Each instance pairs the hit's entity ID
    (the hub ID) with the document IDs of all hot entities found in the
    hit's contents.
    """
    request = self._create_request()
    hot_entity_types = self.hot_entity_types
    for hit in request.scan():
        contents = hit['contents']
        # For every hot entity type, `contents` holds a *list* of inner
        # entities, each of which carries a *list* of document IDs. Both
        # levels must be iterated; subscripting the outer list with
        # 'document_id' would raise a TypeError. Unlike the previous
        # single-type implementation, this does not insist on exactly one
        # inner entity or exactly one document ID per hit.
        replica_ids = [
            document_id
            for entity_type in hot_entity_types
            for inner_entity in contents[entity_type]
            for document_id in inner_entity['document_id']
        ]
        yield self.ReplicaKeys(hub_id=hit['entity_id'],
                               replica_ids=replica_ids)

def _all_replicas(self) -> Iterable[JSON]:
emitted_replica_ids = set()
Expand Down Expand Up @@ -2053,7 +2060,7 @@ def _join_replicas(self, keys: Iterable[ReplicaKeys]) -> Iterable[Hit]:
hub_ids, replica_ids = set(), set()
for key in keys:
hub_ids.add(key.hub_id)
replica_ids.add(key.replica_id)
replica_ids.update(key.replica_ids)
request = request.query(Q('bool', should=[
{'terms': {'hub_ids.keyword': list(hub_ids)}},
{'terms': {'entity_id.keyword': list(replica_ids)}}
Expand Down

0 comments on commit efb58a0

Please sign in to comment.