From 6a77227b025e27513e43f7a5f07614caa84b5ec3 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Thu, 12 Sep 2024 15:57:51 -0700 Subject: [PATCH] Fix: Invalid columns in compact manifest for AnVIL (#6110) --- src/azul/plugins/metadata/anvil/__init__.py | 16 +++++++++++++++- .../plugins/metadata/anvil/service/response.py | 3 +++ test/service/test_manifest.py | 16 +++++----------- 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/src/azul/plugins/metadata/anvil/__init__.py b/src/azul/plugins/metadata/anvil/__init__.py index aed275cad7..18d5a657ab 100644 --- a/src/azul/plugins/metadata/anvil/__init__.py +++ b/src/azul/plugins/metadata/anvil/__init__.py @@ -133,6 +133,9 @@ def _field_mapping(self) -> MetadataPlugin._FieldMapping: 'document_id', 'source_datarepo_row_ids' ] + # Note that there is a brittle coupling that must be maintained between + # the keys here and those used in the custom field name lookup in + # `self.manifest_config`. return { 'entity_id': 'entryId', 'bundles': { @@ -277,9 +280,20 @@ def manifest_config(self) -> ManifestConfig: # the field mapping. Keys are field paths in an ES hit, and values are # the desired manifest column name, or None to omit the column from the # manifest. + # + # Note that there is a brittle coupling that must be maintained between + # the keys here and those used in `self._field_mapping`. Also, the + # values (that aren't `None`) should match the related field's path in + # a response hit from the `/index/files` endpoint. + # custom_field_names = { + ('bundles', 'uuid'): 'bundles.bundle_uuid', + ('bundles', 'version'): 'bundles.bundle_version', + ('contents', 'activities', 'activity_table'): None, ('contents', 'files', 'uuid'): None, ('contents', 'files', 'version'): None, + ('sources', 'id'): 'sources.source_id', + ('sources', 'spec'): 'sources.source_spec', } def recurse(mapping: MetadataPlugin._FieldMapping, path: FieldPath): @@ -302,7 +316,7 @@ def recurse(mapping: MetadataPlugin._FieldMapping, path: FieldPath): # The file URL is synthesized from the `uuid` and `version` fields. # Above, we already configured these two fields to be omitted from the # manifest since they are not informative to the user. - result[('contents', 'files')]['file_url'] = 'files.file_url' + result[('contents', 'files')]['file_url'] = 'files.azul_file_url' return result def verbatim_pfb_schema(self, diff --git a/src/azul/plugins/metadata/anvil/service/response.py b/src/azul/plugins/metadata/anvil/service/response.py index 0d938ced65..6c29287b51 100644 --- a/src/azul/plugins/metadata/anvil/service/response.py +++ b/src/azul/plugins/metadata/anvil/service/response.py @@ -139,6 +139,9 @@ def choose_entry(_term): def _make_hit(self, es_hit: JSON) -> MutableJSON: return { 'entryId': es_hit['entity_id'], + # Note that there is a brittle coupling that must be maintained + # between the `sources` and `bundles` field paths and the values in + # the custom field name lookup in `Plugin.manifest_config`. 'sources': list(map(self._make_source, es_hit['sources'])), 'bundles': list(map(self._make_bundle, es_hit['bundles'])), **self._make_contents(es_hit['contents']) diff --git a/test/service/test_manifest.py b/test/service/test_manifest.py index 92d0b5546f..da3bf4b3a5 100644 --- a/test/service/test_manifest.py +++ b/test/service/test_manifest.py @@ -1716,25 +1716,25 @@ def test_compact_manifest(self): self.assertEqual(200, response.status_code) expected = [ ( - 'bundle_uuid', + 'bundles.bundle_uuid', '6b0f6c0f-5d80-a242-accb-840921351cd5', '826dea02-e274-affe-aabc-eb3db63ad068', '826dea02-e274-affe-aabc-eb3db63ad068' ), ( - 'bundle_version', + 'bundles.bundle_version', '2022-06-01T00:00:00.000000Z', '2022-06-01T00:00:00.000000Z', '2022-06-01T00:00:00.000000Z' ), ( - 'source_id', + 'sources.source_id', '6c87f0e1-509d-46a4-b845-7584df39263b', '6c87f0e1-509d-46a4-b845-7584df39263b', '6c87f0e1-509d-46a4-b845-7584df39263b' ), ( - 'source_spec', + 'sources.source_spec', 'tdr:bigquery:gcp:test_anvil_project:anvil_snapshot:/2', 'tdr:bigquery:gcp:test_anvil_project:anvil_snapshot:/2', 'tdr:bigquery:gcp:test_anvil_project:anvil_snapshot:/2' @@ -1973,12 +1973,6 @@ def test_compact_manifest(self): '18b3be87-e26b-4376-0d8d-c1e370e90e07', 'a60c5138-3749-f7cb-8714-52d389ad5231' ), - ( - 'activities.activity_table', - '', - 'sequencingactivity', - 'sequencingactivity' - ), ( 'activities.activity_type', '', @@ -2082,7 +2076,7 @@ def test_compact_manifest(self): self._drs_uri('v1_6c87f0e1-509d-46a4-b845-7584df39263b_8b722e88-8103-49c1-b351-e64fa7c6ab37') ), ( - 'files.file_url', + 'files.azul_file_url', self._file_url('6b0f6c0f-5d80-4242-accb-840921351cd5', self.version), self._file_url('15b76f9c-6b46-433f-851d-34e89f1b9ba6', self.version), self._file_url('3b17377b-16b1-431c-9967-e5d01fc5923f', self.version)