From 360a902833500ce62b4a36337ebcca574d825ff8 Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Thu, 26 Sep 2024 18:49:03 -0700 Subject: [PATCH] [r] Include parent dataset/project in hub IDs (#6626) --- src/azul/indexer/transform.py | 3 +- .../metadata/anvil/indexer/transform.py | 7 +++-- .../plugins/metadata/hca/indexer/transform.py | 8 +++-- ...2-e274-affe-aabc-eb3db63ad068.results.json | 16 ++++++++-- ...d.2018-11-02T11:33:44.698028Z.results.json | 31 +++++++++++++------ test/indexer/test_anvil.py | 7 ++++- 6 files changed, 52 insertions(+), 20 deletions(-) diff --git a/src/azul/indexer/transform.py b/src/azul/indexer/transform.py index 1e49c5746..4a2f2c8dc 100644 --- a/src/azul/indexer/transform.py +++ b/src/azul/indexer/transform.py @@ -133,6 +133,7 @@ def _contribution(self, def _replica(self, entity: EntityReference, *, + root_hub: EntityID, file_hub: EntityID | None, ) -> Replica: replica_type, contents = self._replicate(entity) @@ -144,7 +145,7 @@ def _replica(self, contents=contents, # The other hubs will be added when the indexer # consolidates duplicate replicas. - hub_ids=alist(file_hub)) + hub_ids=alist(file_hub, root_hub)) @classmethod @abstractmethod diff --git a/src/azul/plugins/metadata/anvil/indexer/transform.py b/src/azul/plugins/metadata/anvil/indexer/transform.py index 395d40567..9e2f4f22e 100644 --- a/src/azul/plugins/metadata/anvil/indexer/transform.py +++ b/src/azul/plugins/metadata/anvil/indexer/transform.py @@ -585,9 +585,10 @@ def transform(self, # will be redundant with those emitted by the file transformer, but # these will be coalesced by the index service before they are # written to ElasticSearch. + dataset = self._only_dataset() for entity in chain(self.bundle.orphans, self.bundle.entities): if partition.contains(UUID(entity.entity_id)): - yield self._replica(entity, file_hub=None) + yield self._replica(entity, file_hub=None, root_hub=dataset.entity_id) class DatasetTransformer(SingletonTransformer): @@ -632,6 +633,7 @@ def _transform(self, entity: EntityReference ) -> Iterable[Contribution | Replica]: linked = self._linked_entities(entity) + dataset = self._only_dataset() contents = dict( activities=self._entities(self._activity, chain.from_iterable( linked[activity_type] @@ -645,7 +647,7 @@ def _transform(self, ) yield self._contribution(contents, entity.entity_id) if config.enable_replicas: - yield self._replica(entity, file_hub=entity.entity_id) + yield self._replica(entity, file_hub=entity.entity_id, root_hub=dataset.entity_id) for linked_entity in linked: yield self._replica( linked_entity, @@ -655,4 +657,5 @@ def _transform(self, # hub IDs field empty for datasets and rely on the tenet # that every file is an implicit hub of its parent dataset. file_hub=None if linked_entity.entity_type == 'anvil_dataset' else entity.entity_id, + root_hub=dataset.entity_id ) diff --git a/src/azul/plugins/metadata/hca/indexer/transform.py b/src/azul/plugins/metadata/hca/indexer/transform.py index 00380d856..174d43878 100644 --- a/src/azul/plugins/metadata/hca/indexer/transform.py +++ b/src/azul/plugins/metadata/hca/indexer/transform.py @@ -1470,17 +1470,19 @@ def _transform(self, for entity_type, values in additional_contents.items(): contents[entity_type].extend(values) file_id = file.ref.entity_id + project_ref = self._api_project.ref + project_id = project_ref.entity_id yield self._contribution(contents, file_id) if config.enable_replicas: - yield self._replica(self.api_bundle.ref, file_hub=file_id) + yield self._replica(self.api_bundle.ref, file_hub=file_id, root_hub=project_id) # Projects are linked to every file in their snapshot, # making an explicit list of hub IDs for the project both # redundant and impractically large. Therefore, we leave the # hub IDs field empty for projects and rely on the tenet # that every file is an implicit hub of its parent project. - yield self._replica(self._api_project.ref, file_hub=None) + yield self._replica(project_ref, file_hub=None, root_hub=project_id) for linked_entity in visitor.entities: - yield self._replica(linked_entity, file_hub=file_id) + yield self._replica(linked_entity, file_hub=file_id, root_hub=project_id) def matrix_stratification_values(self, file: api.File) -> JSON: """ diff --git a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json index 5d3fd8243..2e09718c1 100644 --- a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json +++ b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json @@ -437,7 +437,8 @@ "entity_id": "1509ef40-d1ba-440d-b298-16b7c173dcd4", "replica_type": "anvil_sequencingactivity", "hub_ids": [ - "15b76f9c-6b46-433f-851d-34e89f1b9ba6" + "15b76f9c-6b46-433f-851d-34e89f1b9ba6", + "2370f948-2783-4eb6-afea-e022897f4dcf" ], "contents": { "activity_type": "Sequencing", @@ -871,7 +872,8 @@ "entity_id": "15b76f9c-6b46-433f-851d-34e89f1b9ba6", "replica_type": "anvil_file", "hub_ids": [ - "15b76f9c-6b46-433f-851d-34e89f1b9ba6" + "15b76f9c-6b46-433f-851d-34e89f1b9ba6", + "2370f948-2783-4eb6-afea-e022897f4dcf" ], "contents": { "data_modality": [], @@ -904,6 +906,7 @@ "replica_type": "anvil_diagnosis", "hub_ids": [ "15b76f9c-6b46-433f-851d-34e89f1b9ba6", + "2370f948-2783-4eb6-afea-e022897f4dcf", "3b17377b-16b1-431c-9967-e5d01fc5923f" ], "contents": { @@ -1477,7 +1480,9 @@ "_source": { "entity_id": "2370f948-2783-4eb6-afea-e022897f4dcf", "replica_type": "anvil_dataset", - "hub_ids": [], + "hub_ids": [ + "2370f948-2783-4eb6-afea-e022897f4dcf" + ], "contents": { "consent_group": [ "DS-BDIS" @@ -1748,6 +1753,7 @@ }, "replica_type": "anvil_file", "hub_ids": [ + "2370f948-2783-4eb6-afea-e022897f4dcf", "3b17377b-16b1-431c-9967-e5d01fc5923f" ] } @@ -2190,6 +2196,7 @@ "entity_id": "816e364e-1193-4e5b-a91a-14e4b009157c", "replica_type": "anvil_sequencingactivity", "hub_ids": [ + "2370f948-2783-4eb6-afea-e022897f4dcf", "3b17377b-16b1-431c-9967-e5d01fc5923f" ], "contents": { @@ -2939,6 +2946,7 @@ "replica_type": "anvil_biosample", "hub_ids": [ "15b76f9c-6b46-433f-851d-34e89f1b9ba6", + "2370f948-2783-4eb6-afea-e022897f4dcf", "3b17377b-16b1-431c-9967-e5d01fc5923f" ], "contents": { @@ -3519,6 +3527,7 @@ "replica_type": "anvil_diagnosis", "hub_ids": [ "15b76f9c-6b46-433f-851d-34e89f1b9ba6", + "2370f948-2783-4eb6-afea-e022897f4dcf", "3b17377b-16b1-431c-9967-e5d01fc5923f" ], "contents": { @@ -4107,6 +4116,7 @@ "replica_type": "anvil_donor", "hub_ids": [ "15b76f9c-6b46-433f-851d-34e89f1b9ba6", + "2370f948-2783-4eb6-afea-e022897f4dcf", "3b17377b-16b1-431c-9967-e5d01fc5923f" ] } diff --git a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json index d7b65274b..24aab5fdc 100644 --- a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json +++ b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json @@ -70,7 +70,8 @@ "entity_id": "aaa96233-bf27-44c7-82df-b4dc15ad4d9d", "hub_ids": [ "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb", - "70d1af4a-82c8-478a-8960-e9028b3616ca" + "70d1af4a-82c8-478a-8960-e9028b3616ca", + "e8642221-4c2c-4fd7-b926-a68bce363c88" ], "replica_type": "links" } @@ -3485,7 +3486,8 @@ "entity_id": "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb", "replica_type": "sequence_file", "hub_ids": [ - "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb" + "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb", + "e8642221-4c2c-4fd7-b926-a68bce363c88" ] }, "_type": "_doc" @@ -3524,7 +3526,8 @@ "replica_type": "cell_suspension", "hub_ids": [ "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb", - "70d1af4a-82c8-478a-8960-e9028b3616ca" + "70d1af4a-82c8-478a-8960-e9028b3616ca", + "e8642221-4c2c-4fd7-b926-a68bce363c88" ] }, "_type": "_doc" @@ -3555,7 +3558,8 @@ "entity_id": "70d1af4a-82c8-478a-8960-e9028b3616ca", "replica_type": "sequence_file", "hub_ids": [ - "70d1af4a-82c8-478a-8960-e9028b3616ca" + "70d1af4a-82c8-478a-8960-e9028b3616ca", + "e8642221-4c2c-4fd7-b926-a68bce363c88" ] }, "_type": "_doc" @@ -3608,7 +3612,8 @@ "replica_type": "specimen_from_organism", "hub_ids": [ "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb", - "70d1af4a-82c8-478a-8960-e9028b3616ca" + "70d1af4a-82c8-478a-8960-e9028b3616ca", + "e8642221-4c2c-4fd7-b926-a68bce363c88" ] }, "_type": "_doc" @@ -3688,7 +3693,9 @@ }, "entity_id": "e8642221-4c2c-4fd7-b926-a68bce363c88", "replica_type": "project", - "hub_ids": [] + "hub_ids": [ + "e8642221-4c2c-4fd7-b926-a68bce363c88" + ] }, "_type": "_doc" }, @@ -3750,7 +3757,8 @@ "entity_id": "7b07b9d0-cc0e-4098-9f64-f4a569f7d746", "hub_ids": [ "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb", - "70d1af4a-82c8-478a-8960-e9028b3616ca" + "70d1af4a-82c8-478a-8960-e9028b3616ca", + "e8642221-4c2c-4fd7-b926-a68bce363c88" ], "replica_type": "donor_organism" }, @@ -3793,7 +3801,8 @@ "entity_id": "9c32cf70-3ed7-4720-badc-5ee71e8a38af", "hub_ids": [ "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb", - "70d1af4a-82c8-478a-8960-e9028b3616ca" + "70d1af4a-82c8-478a-8960-e9028b3616ca", + "e8642221-4c2c-4fd7-b926-a68bce363c88" ], "replica_type": "library_preparation_protocol" }, @@ -3830,7 +3839,8 @@ "entity_id": "61e629ed-0135-4492-ac8a-5c4ab3ccca8a", "hub_ids": [ "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb", - "70d1af4a-82c8-478a-8960-e9028b3616ca" + "70d1af4a-82c8-478a-8960-e9028b3616ca", + "e8642221-4c2c-4fd7-b926-a68bce363c88" ], "replica_type": "sequencing_protocol" }, @@ -3856,7 +3866,8 @@ "entity_id": "771ddaf6-3a4f-4314-97fe-6294ff8e25a4", "hub_ids": [ "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb", - "70d1af4a-82c8-478a-8960-e9028b3616ca" + "70d1af4a-82c8-478a-8960-e9028b3616ca", + "e8642221-4c2c-4fd7-b926-a68bce363c88" ], "replica_type": "process" }, diff --git a/test/indexer/test_anvil.py b/test/indexer/test_anvil.py index ae074c644..d62b03193 100644 --- a/test/indexer/test_anvil.py +++ b/test/indexer/test_anvil.py @@ -265,6 +265,11 @@ def test_dataset_description(self): def test_orphans(self): bundle = self._load_canned_bundle(self.replica_bundle()) self._index_bundle(bundle) + dataset_entity_id = one( + ref.entity_id + for ref in bundle.orphans + if ref.entity_type == 'anvil_dataset' + ) expected = bundle.orphans if config.enable_replicas else {} actual = {} hits = self._get_all_hits() @@ -272,7 +277,7 @@ def test_orphans(self): qualifier, doc_type = self._parse_index_name(hit) self.assertEqual(DocumentType.replica, doc_type) source = hit['_source'] - self.assertEqual(source['hub_ids'], []) + self.assertEqual(source['hub_ids'], [dataset_entity_id]) ref = EntityReference(entity_type=source['replica_type'], entity_id=source['entity_id']) actual[ref] = source['contents']