Skip to content

Commit

Permalink
[r] Include parent dataset/project in hub IDs (#6626)
Browse files Browse the repository at this point in the history
  • Loading branch information
nadove-ucsc committed Oct 17, 2024
1 parent e0185c2 commit f3bc4ff
Show file tree
Hide file tree
Showing 6 changed files with 54 additions and 22 deletions.
3 changes: 2 additions & 1 deletion src/azul/indexer/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ def _contribution(self,
def _replica(self,
entity: EntityReference,
*,
root_hub: EntityID,
file_hub: EntityID | None,
) -> Replica:
replica_type, contents = self._replicate(entity)
Expand All @@ -144,7 +145,7 @@ def _replica(self,
contents=contents,
# The other hubs will be added when the indexer
# consolidates duplicate replicas.
hub_ids=alist(file_hub))
hub_ids=alist(file_hub, root_hub))

@classmethod
@abstractmethod
Expand Down
11 changes: 7 additions & 4 deletions src/azul/plugins/metadata/anvil/indexer/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -567,9 +567,10 @@ def transform(self,
partition: BundlePartition
) -> Iterable[Contribution | Replica]:
yield from super().transform(partition)
dataset = self._only_dataset()
for orphan in self.bundle.orphans:
if partition.contains(UUID(orphan.entity_id)):
yield self._replica(orphan, file_hub=None)
yield self._replica(orphan, file_hub=None, root_hub=dataset.entity_id)


class DatasetTransformer(SingletonTransformer):
Expand All @@ -586,7 +587,7 @@ def _transform(self,
) -> Iterable[Contribution | Replica]:
yield from super()._transform(entity)
if self._is_duos(entity):
yield self._replica(entity, file_hub=None)
yield self._replica(entity, file_hub=None, root_hub=entity.entity_id)


class DonorTransformer(BaseTransformer):
Expand Down Expand Up @@ -621,20 +622,21 @@ def _transform(self,
entity: EntityReference
) -> Iterable[Contribution | Replica]:
linked = self._linked_entities(entity)
dataset = self._only_dataset()
contents = dict(
activities=self._entities(self._activity, chain.from_iterable(
linked[activity_type]
for activity_type in self._activity_polymorphic_types
)),
biosamples=self._entities(self._biosample, linked['biosample']),
datasets=[self._dataset(self._only_dataset())],
datasets=[self._dataset(dataset)],
diagnoses=self._entities(self._diagnosis, linked['diagnosis']),
donors=self._entities(self._donor, linked['donor']),
files=[self._file(entity)],
)
yield self._contribution(contents, entity.entity_id)
if config.enable_replicas:
yield self._replica(entity, file_hub=entity.entity_id)
yield self._replica(entity, file_hub=entity.entity_id, root_hub=dataset.entity_id)
for linked_entity in linked:
yield self._replica(
linked_entity,
Expand All @@ -644,4 +646,5 @@ def _transform(self,
# hub IDs field empty for datasets and rely on the tenet
# that every file is an implicit hub of its parent dataset.
file_hub=None if linked_entity.entity_type == 'dataset' else entity.entity_id,
root_hub=dataset.entity_id
)
8 changes: 5 additions & 3 deletions src/azul/plugins/metadata/hca/indexer/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -1470,17 +1470,19 @@ def _transform(self,
for entity_type, values in additional_contents.items():
contents[entity_type].extend(values)
file_id = file.ref.entity_id
project_ref = self._api_project.ref
project_id = project_ref.entity_id
yield self._contribution(contents, file_id)
if config.enable_replicas:
yield self._replica(self.api_bundle.ref, file_hub=file_id)
yield self._replica(self.api_bundle.ref, file_hub=file_id, root_hub=project_id)
# Projects are linked to every file in their snapshot,
# making an explicit list of file hub IDs for the project both
# redundant and impractically large. Therefore, we pass no file
# hub for projects, leaving the project's own ID (the root hub)
# as the only hub ID supplied here, and rely on the tenet
# that every file is an implicit hub of its parent project.
yield self._replica(self._api_project.ref, file_hub=None)
yield self._replica(project_ref, file_hub=None, root_hub=project_id)
for linked_entity in visitor.entities:
yield self._replica(linked_entity, file_hub=file_id)
yield self._replica(linked_entity, file_hub=file_id, root_hub=project_id)

def matrix_stratification_values(self, file: api.File) -> JSON:
"""
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 6 additions & 1 deletion test/indexer/test_anvil.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,14 +259,19 @@ def test_dataset_description(self):
def test_orphans(self):
bundle = self._load_canned_bundle(self.replica_bundle())
self._index_bundle(bundle)
dataset_entity_id = one(
ref.entity_id
for ref in bundle.orphans
if ref.entity_type == 'dataset'
)
expected = bundle.orphans if config.enable_replicas else {}
actual = {}
hits = self._get_all_hits()
for hit in hits:
qualifier, doc_type = self._parse_index_name(hit)
self.assertEqual(DocumentType.replica, doc_type)
source = hit['_source']
self.assertEqual(source['hub_ids'], [])
self.assertEqual(source['hub_ids'], [dataset_entity_id])
ref = EntityReference(entity_type=source['replica_type'].removeprefix('anvil_'),
entity_id=source['entity_id'])
actual[ref] = source['contents']
Expand Down

0 comments on commit f3bc4ff

Please sign in to comment.