Skip to content

Commit

Permalink
[r] Include parent dataset/project in hub IDs (#6626)
Browse files: browse the repository at this point in the history
  • Loading branch information
nadove-ucsc committed Nov 3, 2024
1 parent 6d20dae commit 23bcf50
Show file tree
Hide file tree
Showing 6 changed files with 52 additions and 20 deletions.
3 changes: 2 additions & 1 deletion src/azul/indexer/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ def _contribution(self,
def _replica(self,
entity: EntityReference,
*,
root_hub: EntityID,
file_hub: EntityID | None,
) -> Replica:
replica_type, contents = self._replicate(entity)
Expand All @@ -144,7 +145,7 @@ def _replica(self,
contents=contents,
# The other hubs will be added when the indexer
# consolidates duplicate replicas.
hub_ids=alist(file_hub))
hub_ids=alist(file_hub, root_hub))

@classmethod
@abstractmethod
Expand Down
7 changes: 5 additions & 2 deletions src/azul/plugins/metadata/anvil/indexer/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -585,9 +585,10 @@ def transform(self,
# will be redundant with those emitted by the file transformer, but
# these will be coalesced by the index service before they are
# written to ElasticSearch.
dataset = self._only_dataset()
for entity in chain(self.bundle.orphans, self.bundle.entities):
if partition.contains(UUID(entity.entity_id)):
yield self._replica(entity, file_hub=None)
yield self._replica(entity, file_hub=None, root_hub=dataset.entity_id)


class DatasetTransformer(SingletonTransformer):
Expand Down Expand Up @@ -632,6 +633,7 @@ def _transform(self,
entity: EntityReference
) -> Iterable[Contribution | Replica]:
linked = self._linked_entities(entity)
dataset = self._only_dataset()
contents = dict(
activities=self._entities(self._activity, chain.from_iterable(
linked[activity_type]
Expand All @@ -645,7 +647,7 @@ def _transform(self,
)
yield self._contribution(contents, entity.entity_id)
if config.enable_replicas:
yield self._replica(entity, file_hub=entity.entity_id)
yield self._replica(entity, file_hub=entity.entity_id, root_hub=dataset.entity_id)
for linked_entity in linked:
yield self._replica(
linked_entity,
Expand All @@ -655,4 +657,5 @@ def _transform(self,
# hub IDs field empty for datasets and rely on the tenet
# that every file is an implicit hub of its parent dataset.
file_hub=None if linked_entity.entity_type == 'anvil_dataset' else entity.entity_id,
root_hub=dataset.entity_id
)
8 changes: 5 additions & 3 deletions src/azul/plugins/metadata/hca/indexer/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -1470,17 +1470,19 @@ def _transform(self,
for entity_type, values in additional_contents.items():
contents[entity_type].extend(values)
file_id = file.ref.entity_id
project_ref = self._api_project.ref
project_id = project_ref.entity_id
yield self._contribution(contents, file_id)
if config.enable_replicas:
yield self._replica(self.api_bundle.ref, file_hub=file_id)
yield self._replica(self.api_bundle.ref, file_hub=file_id, root_hub=project_id)
# Projects are linked to every file in their snapshot,
# making an explicit list of hub IDs for the project both
# redundant and impractically large. Therefore, we leave the
# hub IDs field empty for projects and rely on the tenet
# that every file is an implicit hub of its parent project.
yield self._replica(self._api_project.ref, file_hub=None)
yield self._replica(project_ref, file_hub=None, root_hub=project_id)
for linked_entity in visitor.entities:
yield self._replica(linked_entity, file_hub=file_id)
yield self._replica(linked_entity, file_hub=file_id, root_hub=project_id)

def matrix_stratification_values(self, file: api.File) -> JSON:
"""
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 6 additions & 1 deletion test/indexer/test_anvil.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,14 +265,19 @@ def test_dataset_description(self):
def test_orphans(self):
bundle = self._load_canned_bundle(self.replica_bundle())
self._index_bundle(bundle)
dataset_entity_id = one(
ref.entity_id
for ref in bundle.orphans
if ref.entity_type == 'anvil_dataset'
)
expected = bundle.orphans if config.enable_replicas else {}
actual = {}
hits = self._get_all_hits()
for hit in hits:
qualifier, doc_type = self._parse_index_name(hit)
self.assertEqual(DocumentType.replica, doc_type)
source = hit['_source']
self.assertEqual(source['hub_ids'], [])
self.assertEqual(source['hub_ids'], [dataset_entity_id])
ref = EntityReference(entity_type=source['replica_type'],
entity_id=source['entity_id'])
actual[ref] = source['contents']
Expand Down

0 comments on commit 23bcf50

Please sign in to comment.