From 360a902833500ce62b4a36337ebcca574d825ff8 Mon Sep 17 00:00:00 2001
From: Noa Aviel Dove <nadove@ucsc.edu>
Date: Thu, 26 Sep 2024 18:49:03 -0700
Subject: [PATCH] [r] Include parent dataset/project in hub IDs (#6626)

---
 src/azul/indexer/transform.py                 |  3 +-
 .../metadata/anvil/indexer/transform.py       |  7 +++--
 .../plugins/metadata/hca/indexer/transform.py |  8 +++--
 ...2-e274-affe-aabc-eb3db63ad068.results.json | 16 ++++++++--
 ...d.2018-11-02T11:33:44.698028Z.results.json | 31 +++++++++++++------
 test/indexer/test_anvil.py                    |  7 ++++-
 6 files changed, 52 insertions(+), 20 deletions(-)

diff --git a/src/azul/indexer/transform.py b/src/azul/indexer/transform.py
index 1e49c5746..4a2f2c8dc 100644
--- a/src/azul/indexer/transform.py
+++ b/src/azul/indexer/transform.py
@@ -133,6 +133,7 @@ def _contribution(self,
     def _replica(self,
                  entity: EntityReference,
                  *,
+                 root_hub: EntityID,
                  file_hub: EntityID | None,
                  ) -> Replica:
         replica_type, contents = self._replicate(entity)
@@ -144,7 +145,7 @@ def _replica(self,
                        contents=contents,
                        # The other hubs will be added when the indexer
                        # consolidates duplicate replicas.
-                       hub_ids=alist(file_hub))
+                       hub_ids=alist(file_hub, root_hub))
 
     @classmethod
     @abstractmethod
diff --git a/src/azul/plugins/metadata/anvil/indexer/transform.py b/src/azul/plugins/metadata/anvil/indexer/transform.py
index 395d40567..9e2f4f22e 100644
--- a/src/azul/plugins/metadata/anvil/indexer/transform.py
+++ b/src/azul/plugins/metadata/anvil/indexer/transform.py
@@ -585,9 +585,10 @@ def transform(self,
             # will be redundant with those emitted by the file transformer, but
             # these will be coalesced by the index service before they are
             # written to ElasticSearch.
+            dataset = self._only_dataset()
             for entity in chain(self.bundle.orphans, self.bundle.entities):
                 if partition.contains(UUID(entity.entity_id)):
-                    yield self._replica(entity, file_hub=None)
+                    yield self._replica(entity, file_hub=None, root_hub=dataset.entity_id)
 
 
 class DatasetTransformer(SingletonTransformer):
@@ -632,6 +633,7 @@ def _transform(self,
                    entity: EntityReference
                    ) -> Iterable[Contribution | Replica]:
         linked = self._linked_entities(entity)
+        dataset = self._only_dataset()
         contents = dict(
             activities=self._entities(self._activity, chain.from_iterable(
                 linked[activity_type]
@@ -645,7 +647,7 @@ def _transform(self,
         )
         yield self._contribution(contents, entity.entity_id)
         if config.enable_replicas:
-            yield self._replica(entity, file_hub=entity.entity_id)
+            yield self._replica(entity, file_hub=entity.entity_id, root_hub=dataset.entity_id)
             for linked_entity in linked:
                 yield self._replica(
                     linked_entity,
@@ -655,4 +657,5 @@ def _transform(self,
                     # hub IDs field empty for datasets and rely on the tenet
                     # that every file is an implicit hub of its parent dataset.
                     file_hub=None if linked_entity.entity_type == 'anvil_dataset' else entity.entity_id,
+                    root_hub=dataset.entity_id
                 )
diff --git a/src/azul/plugins/metadata/hca/indexer/transform.py b/src/azul/plugins/metadata/hca/indexer/transform.py
index 00380d856..174d43878 100644
--- a/src/azul/plugins/metadata/hca/indexer/transform.py
+++ b/src/azul/plugins/metadata/hca/indexer/transform.py
@@ -1470,17 +1470,19 @@ def _transform(self,
                         for entity_type, values in additional_contents.items():
                             contents[entity_type].extend(values)
                 file_id = file.ref.entity_id
+                project_ref = self._api_project.ref
+                project_id = project_ref.entity_id
                 yield self._contribution(contents, file_id)
                 if config.enable_replicas:
-                    yield self._replica(self.api_bundle.ref, file_hub=file_id)
+                    yield self._replica(self.api_bundle.ref, file_hub=file_id, root_hub=project_id)
                     # Projects are linked to every file in their snapshot,
                     # making an explicit list of hub IDs for the project both
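-                    # redundant and impractically large. Therefore, we leave the
-                    # hub IDs field empty for projects and rely on the tenet
+                    # redundant and impractically large. Therefore, we omit the
+                    # file hub, keeping only the root hub, and rely on the tenet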
                     # that every file is an implicit hub of its parent project.
-                    yield self._replica(self._api_project.ref, file_hub=None)
+                    yield self._replica(project_ref, file_hub=None, root_hub=project_id)
                     for linked_entity in visitor.entities:
-                        yield self._replica(linked_entity, file_hub=file_id)
+                        yield self._replica(linked_entity, file_hub=file_id, root_hub=project_id)
 
     def matrix_stratification_values(self, file: api.File) -> JSON:
         """
diff --git a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json
index 5d3fd8243..2e09718c1 100644
--- a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json
+++ b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json
@@ -437,7 +437,8 @@
             "entity_id": "1509ef40-d1ba-440d-b298-16b7c173dcd4",
             "replica_type": "anvil_sequencingactivity",
             "hub_ids": [
-                "15b76f9c-6b46-433f-851d-34e89f1b9ba6"
+                "15b76f9c-6b46-433f-851d-34e89f1b9ba6",
+                "2370f948-2783-4eb6-afea-e022897f4dcf"
             ],
             "contents": {
                 "activity_type": "Sequencing",
@@ -871,7 +872,8 @@
             "entity_id": "15b76f9c-6b46-433f-851d-34e89f1b9ba6",
             "replica_type": "anvil_file",
             "hub_ids": [
-                "15b76f9c-6b46-433f-851d-34e89f1b9ba6"
+                "15b76f9c-6b46-433f-851d-34e89f1b9ba6",
+                "2370f948-2783-4eb6-afea-e022897f4dcf"
             ],
             "contents": {
                 "data_modality": [],
@@ -904,6 +906,7 @@
             "replica_type": "anvil_diagnosis",
             "hub_ids": [
                 "15b76f9c-6b46-433f-851d-34e89f1b9ba6",
+                "2370f948-2783-4eb6-afea-e022897f4dcf",
                 "3b17377b-16b1-431c-9967-e5d01fc5923f"
             ],
             "contents": {
@@ -1477,7 +1480,9 @@
         "_source": {
             "entity_id": "2370f948-2783-4eb6-afea-e022897f4dcf",
             "replica_type": "anvil_dataset",
-            "hub_ids": [],
+            "hub_ids": [
+                "2370f948-2783-4eb6-afea-e022897f4dcf"
+            ],
             "contents": {
                 "consent_group": [
                     "DS-BDIS"
@@ -1748,6 +1753,7 @@
             },
             "replica_type": "anvil_file",
             "hub_ids": [
+                "2370f948-2783-4eb6-afea-e022897f4dcf",
                 "3b17377b-16b1-431c-9967-e5d01fc5923f"
             ]
         }
@@ -2190,6 +2196,7 @@
             "entity_id": "816e364e-1193-4e5b-a91a-14e4b009157c",
             "replica_type": "anvil_sequencingactivity",
             "hub_ids": [
+                "2370f948-2783-4eb6-afea-e022897f4dcf",
                 "3b17377b-16b1-431c-9967-e5d01fc5923f"
             ],
             "contents": {
@@ -2939,6 +2946,7 @@
             "replica_type": "anvil_biosample",
             "hub_ids": [
                 "15b76f9c-6b46-433f-851d-34e89f1b9ba6",
+                "2370f948-2783-4eb6-afea-e022897f4dcf",
                 "3b17377b-16b1-431c-9967-e5d01fc5923f"
             ],
             "contents": {
@@ -3519,6 +3527,7 @@
             "replica_type": "anvil_diagnosis",
             "hub_ids": [
                 "15b76f9c-6b46-433f-851d-34e89f1b9ba6",
+                "2370f948-2783-4eb6-afea-e022897f4dcf",
                 "3b17377b-16b1-431c-9967-e5d01fc5923f"
             ],
             "contents": {
@@ -4107,6 +4116,7 @@
             "replica_type": "anvil_donor",
             "hub_ids": [
                 "15b76f9c-6b46-433f-851d-34e89f1b9ba6",
+                "2370f948-2783-4eb6-afea-e022897f4dcf",
                 "3b17377b-16b1-431c-9967-e5d01fc5923f"
             ]
         }
diff --git a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json
index d7b65274b..24aab5fdc 100644
--- a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json
+++ b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json
@@ -70,7 +70,8 @@
             "entity_id": "aaa96233-bf27-44c7-82df-b4dc15ad4d9d",
             "hub_ids": [
                 "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb",
-                "70d1af4a-82c8-478a-8960-e9028b3616ca"
+                "70d1af4a-82c8-478a-8960-e9028b3616ca",
+                "e8642221-4c2c-4fd7-b926-a68bce363c88"
             ],
             "replica_type": "links"
         }
@@ -3485,7 +3486,8 @@
             "entity_id": "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb",
             "replica_type": "sequence_file",
             "hub_ids": [
-                "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb"
+                "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb",
+                "e8642221-4c2c-4fd7-b926-a68bce363c88"
             ]
         },
         "_type": "_doc"
@@ -3524,7 +3526,8 @@
             "replica_type": "cell_suspension",
             "hub_ids": [
                 "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb",
-                "70d1af4a-82c8-478a-8960-e9028b3616ca"
+                "70d1af4a-82c8-478a-8960-e9028b3616ca",
+                "e8642221-4c2c-4fd7-b926-a68bce363c88"
             ]
         },
         "_type": "_doc"
@@ -3555,7 +3558,8 @@
             "entity_id": "70d1af4a-82c8-478a-8960-e9028b3616ca",
             "replica_type": "sequence_file",
             "hub_ids": [
-                "70d1af4a-82c8-478a-8960-e9028b3616ca"
+                "70d1af4a-82c8-478a-8960-e9028b3616ca",
+                "e8642221-4c2c-4fd7-b926-a68bce363c88"
             ]
         },
         "_type": "_doc"
@@ -3608,7 +3612,8 @@
             "replica_type": "specimen_from_organism",
             "hub_ids": [
                 "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb",
-                "70d1af4a-82c8-478a-8960-e9028b3616ca"
+                "70d1af4a-82c8-478a-8960-e9028b3616ca",
+                "e8642221-4c2c-4fd7-b926-a68bce363c88"
             ]
         },
         "_type": "_doc"
@@ -3688,7 +3693,9 @@
             },
             "entity_id": "e8642221-4c2c-4fd7-b926-a68bce363c88",
             "replica_type": "project",
-            "hub_ids": []
+            "hub_ids": [
+                "e8642221-4c2c-4fd7-b926-a68bce363c88"
+            ]
         },
         "_type": "_doc"
     },
@@ -3750,7 +3757,8 @@
             "entity_id": "7b07b9d0-cc0e-4098-9f64-f4a569f7d746",
             "hub_ids": [
                 "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb",
-                "70d1af4a-82c8-478a-8960-e9028b3616ca"
+                "70d1af4a-82c8-478a-8960-e9028b3616ca",
+                "e8642221-4c2c-4fd7-b926-a68bce363c88"
             ],
             "replica_type": "donor_organism"
         },
@@ -3793,7 +3801,8 @@
             "entity_id": "9c32cf70-3ed7-4720-badc-5ee71e8a38af",
             "hub_ids": [
                 "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb",
-                "70d1af4a-82c8-478a-8960-e9028b3616ca"
+                "70d1af4a-82c8-478a-8960-e9028b3616ca",
+                "e8642221-4c2c-4fd7-b926-a68bce363c88"
             ],
             "replica_type": "library_preparation_protocol"
         },
@@ -3830,7 +3839,8 @@
             "entity_id": "61e629ed-0135-4492-ac8a-5c4ab3ccca8a",
             "hub_ids": [
                 "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb",
-                "70d1af4a-82c8-478a-8960-e9028b3616ca"
+                "70d1af4a-82c8-478a-8960-e9028b3616ca",
+                "e8642221-4c2c-4fd7-b926-a68bce363c88"
             ],
             "replica_type": "sequencing_protocol"
         },
@@ -3856,7 +3866,8 @@
             "entity_id": "771ddaf6-3a4f-4314-97fe-6294ff8e25a4",
             "hub_ids": [
                 "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb",
-                "70d1af4a-82c8-478a-8960-e9028b3616ca"
+                "70d1af4a-82c8-478a-8960-e9028b3616ca",
+                "e8642221-4c2c-4fd7-b926-a68bce363c88"
             ],
             "replica_type": "process"
         },
diff --git a/test/indexer/test_anvil.py b/test/indexer/test_anvil.py
index ae074c644..d62b03193 100644
--- a/test/indexer/test_anvil.py
+++ b/test/indexer/test_anvil.py
@@ -265,6 +265,11 @@ def test_dataset_description(self):
     def test_orphans(self):
         bundle = self._load_canned_bundle(self.replica_bundle())
         self._index_bundle(bundle)
+        dataset_entity_id = one(
+            ref.entity_id
+            for ref in bundle.orphans
+            if ref.entity_type == 'anvil_dataset'
+        )
         expected = bundle.orphans if config.enable_replicas else {}
         actual = {}
         hits = self._get_all_hits()
@@ -272,7 +277,7 @@ def test_orphans(self):
             qualifier, doc_type = self._parse_index_name(hit)
             self.assertEqual(DocumentType.replica, doc_type)
             source = hit['_source']
-            self.assertEqual(source['hub_ids'], [])
+            self.assertEqual(source['hub_ids'], [dataset_entity_id])
             ref = EntityReference(entity_type=source['replica_type'],
                                   entity_id=source['entity_id'])
             actual[ref] = source['contents']