[r] Include parent dataset/project in hub IDs (#6626)

DataBiosphere · Nov 3, 2024 · 23bcf50 · 23bcf50
1 parent 6d20dae
commit 23bcf50
Show file tree

Hide file tree

Showing 6 changed files with 52 additions and 20 deletions.
diff --git a/src/azul/indexer/transform.py b/src/azul/indexer/transform.py
@@ -133,6 +133,7 @@ def _contribution(self,
     def _replica(self,
                  entity: EntityReference,
                  *,
+                 root_hub: EntityID,
                  file_hub: EntityID | None,
                  ) -> Replica:
         replica_type, contents = self._replicate(entity)
@@ -144,7 +145,7 @@ def _replica(self,
                        contents=contents,
                        # The other hubs will be added when the indexer
                        # consolidates duplicate replicas.
-                       hub_ids=alist(file_hub))
+                       hub_ids=alist(file_hub, root_hub))
 
     @classmethod
     @abstractmethod

diff --git a/src/azul/plugins/metadata/anvil/indexer/transform.py b/src/azul/plugins/metadata/anvil/indexer/transform.py
@@ -585,9 +585,10 @@ def transform(self,
             # will be redundant with those emitted by the file transformer, but
             # these will be coalesced by the index service before they are
             # written to ElasticSearch.
+            dataset = self._only_dataset()
             for entity in chain(self.bundle.orphans, self.bundle.entities):
                 if partition.contains(UUID(entity.entity_id)):
-                    yield self._replica(entity, file_hub=None)
+                    yield self._replica(entity, file_hub=None, root_hub=dataset.entity_id)
 
 
 class DatasetTransformer(SingletonTransformer):
@@ -632,6 +633,7 @@ def _transform(self,
                    entity: EntityReference
                    ) -> Iterable[Contribution | Replica]:
         linked = self._linked_entities(entity)
+        dataset = self._only_dataset()
         contents = dict(
             activities=self._entities(self._activity, chain.from_iterable(
                 linked[activity_type]
@@ -645,7 +647,7 @@ def _transform(self,
         )
         yield self._contribution(contents, entity.entity_id)
         if config.enable_replicas:
-            yield self._replica(entity, file_hub=entity.entity_id)
+            yield self._replica(entity, file_hub=entity.entity_id, root_hub=dataset.entity_id)
             for linked_entity in linked:
                 yield self._replica(
                     linked_entity,
@@ -655,4 +657,5 @@ def _transform(self,
                     # hub IDs field empty for datasets and rely on the tenet
                     # that every file is an implicit hub of its parent dataset.
                     file_hub=None if linked_entity.entity_type == 'anvil_dataset' else entity.entity_id,
+                    root_hub=dataset.entity_id
                 )
diff --git a/src/azul/plugins/metadata/hca/indexer/transform.py b/src/azul/plugins/metadata/hca/indexer/transform.py
@@ -1470,17 +1470,19 @@ def _transform(self,
                         for entity_type, values in additional_contents.items():
                             contents[entity_type].extend(values)
                 file_id = file.ref.entity_id
+                project_ref = self._api_project.ref
+                project_id = project_ref.entity_id
                 yield self._contribution(contents, file_id)
                 if config.enable_replicas:
-                    yield self._replica(self.api_bundle.ref, file_hub=file_id)
+                    yield self._replica(self.api_bundle.ref, file_hub=file_id, root_hub=project_id)
                     # Projects are linked to every file in their snapshot,
                     # making an explicit list of file hub IDs for the project
                     # both redundant and impractically large. Therefore, we
                     # pass no file hub for projects and rely on the tenet that
                     # every file is an implicit hub of its parent project.
-                    yield self._replica(self._api_project.ref, file_hub=None)
+                    yield self._replica(project_ref, file_hub=None, root_hub=project_id)
                     for linked_entity in visitor.entities:
-                        yield self._replica(linked_entity, file_hub=file_id)
+                        yield self._replica(linked_entity, file_hub=file_id, root_hub=project_id)
 
     def matrix_stratification_values(self, file: api.File) -> JSON:
         """

diff --git a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json
diff --git a/...ndexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json b/...ndexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json
diff --git a/test/indexer/test_anvil.py b/test/indexer/test_anvil.py
@@ -265,14 +265,19 @@ def test_dataset_description(self):
     def test_orphans(self):
         bundle = self._load_canned_bundle(self.replica_bundle())
         self._index_bundle(bundle)
+        dataset_entity_id = one(
+            ref.entity_id
+            for ref in bundle.orphans
+            if ref.entity_type == 'anvil_dataset'
+        )
         expected = bundle.orphans if config.enable_replicas else {}
         actual = {}
         hits = self._get_all_hits()
         for hit in hits:
             qualifier, doc_type = self._parse_index_name(hit)
             self.assertEqual(DocumentType.replica, doc_type)
             source = hit['_source']
-            self.assertEqual(source['hub_ids'], [])
+            self.assertEqual(source['hub_ids'], [dataset_entity_id])
             ref = EntityReference(entity_type=source['replica_type'],
                                   entity_id=source['entity_id'])
             actual[ref] = source['contents']