Skip to content

Commit

Permalink
Fix: Invalid columns in compact manifest for AnVIL (#6110, PR #6566)
Browse files Browse the repository at this point in the history
  • Loading branch information
dsotirho-ucsc committed Oct 17, 2024
2 parents 16ec5f9 + a5eee6d commit 5ba46d5
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 25 deletions.
45 changes: 37 additions & 8 deletions src/azul/plugins/metadata/anvil/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,10 +136,14 @@ def _field_mapping(self) -> MetadataPlugin._FieldMapping:
return {
'entity_id': 'entryId',
'bundles': {
# These field paths have a brittle coupling that must be
# maintained to the field lookups in `self.manifest_config`.
'uuid': self.special_fields.bundle_uuid,
'version': self.special_fields.bundle_version
},
'sources': {
# These field paths have a brittle coupling that must be
# maintained to the field lookups in `self.manifest_config`.
'id': self.special_fields.source_id,
'spec': self.special_fields.source_spec
},
Expand Down Expand Up @@ -196,6 +200,9 @@ def _field_mapping(self) -> MetadataPlugin._FieldMapping:
f: f'activities.{f}' for f in [
*common_fields,
'activity_id',
# This field path has a brittle coupling that must be
# maintained to the field lookup in
# `self.manifest_config`.
'activity_table',
'activity_type',
'assay_type',
Expand All @@ -222,7 +229,9 @@ def _field_mapping(self) -> MetadataPlugin._FieldMapping:
]
},
# These field names are hard-coded in the implementation of
# the repository service/controller.
# the repository service/controller. Also, these field paths
# have a brittle coupling that must be maintained to the
# field lookups in `self.manifest_config`.
**{
# Not in schema
'version': 'fileVersion',
Expand Down Expand Up @@ -273,6 +282,23 @@ def facets(self) -> Sequence[str]:
def manifest_config(self) -> ManifestConfig:
result = defaultdict(dict)

# Note that there is a brittle coupling that must be maintained between
# the fields listed here and those used in `self._field_mapping`.
fields_to_omit_from_manifest = [
('contents', 'activities', 'activity_table'),
('contents', 'files', 'uuid'),
('contents', 'files', 'version'),
]

# Furthermore, renamed values should match the field's path in a
# response hit from the `/index/files` endpoint.
fields_to_rename_in_manifest = {
('bundles', 'uuid'): 'bundles.bundle_uuid',
('bundles', 'version'): 'bundles.bundle_version',
('sources', 'id'): 'sources.source_id',
('sources', 'spec'): 'sources.source_spec',
}

def recurse(mapping: MetadataPlugin._FieldMapping, path: FieldPath):
for path_element, name_or_type in mapping.items():
new_path = (*path, path_element)
Expand All @@ -281,20 +307,23 @@ def recurse(mapping: MetadataPlugin._FieldMapping, path: FieldPath):
elif isinstance(name_or_type, str):
if new_path == ('entity_id',):
pass
elif new_path == ('contents', 'files', 'uuid'):
# Request the injection of a file URL …
result[path]['file_url'] = 'files.file_url'
# … but suppress the columns for the fields …
result[path][path_element] = None
elif new_path == ('contents', 'files', 'version'):
# … only used by that injection.
elif new_path in fields_to_omit_from_manifest:
result[path][path_element] = None
fields_to_omit_from_manifest.remove(new_path)
elif new_path in fields_to_rename_in_manifest:
result[path][path_element] = fields_to_rename_in_manifest.pop(new_path)
else:
result[path][path_element] = name_or_type
else:
assert False, (path, path_element, name_or_type)

recurse(self._field_mapping, ())
assert len(fields_to_omit_from_manifest) == 0, fields_to_omit_from_manifest
assert len(fields_to_rename_in_manifest) == 0, fields_to_rename_in_manifest
# The file URL is synthesized from the `uuid` and `version` fields.
# Above, we already configured these two fields to be omitted from the
# manifest since they are not informative to the user.
result[('contents', 'files')]['file_url'] = 'files.azul_file_url'
return result

def verbatim_pfb_schema(self,
Expand Down
3 changes: 3 additions & 0 deletions src/azul/plugins/metadata/anvil/service/response.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,9 @@ def choose_entry(_term):
def _make_hit(self, es_hit: JSON) -> MutableJSON:
return {
'entryId': es_hit['entity_id'],
# Note that there is a brittle coupling that must be maintained
# between the `sources` and `bundles` field paths here and the
# renamed fields in `Plugin.manifest_config`.
'sources': list(map(self._make_source, es_hit['sources'])),
'bundles': list(map(self._make_bundle, es_hit['bundles'])),
**self._make_contents(es_hit['contents'])
Expand Down
21 changes: 15 additions & 6 deletions test/integration_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -749,6 +749,14 @@ def _project_type(self, catalog: CatalogName) -> EntityType:
else:
assert False, catalog

def _uuid_column_name(self, catalog: CatalogName) -> str:
if config.is_hca_enabled(catalog):
return 'bundle_uuid'
elif config.is_anvil_enabled(catalog):
return 'bundles.bundle_uuid'
else:
assert False, catalog

def _test_dos_and_drs(self, catalog: CatalogName):
if config.is_dss_enabled(catalog) and config.dss_direct_access:
_, file = self._get_one_inner_file(catalog)
Expand Down Expand Up @@ -927,8 +935,8 @@ def _assertResponseStatus(self,
)
)

def _check_compact_manifest(self, _catalog: CatalogName, response: bytes):
self.__check_csv_manifest(BytesIO(response), 'bundle_uuid')
def _check_compact_manifest(self, catalog: CatalogName, response: bytes):
self.__check_csv_manifest(BytesIO(response), self._uuid_column_name(catalog))

def _check_terra_bdbag_manifest(self, catalog: CatalogName, response: bytes):
with ZipFile(BytesIO(response)) as zip_fh:
Expand Down Expand Up @@ -1047,14 +1055,14 @@ def _read_csv_manifest(self, file: IO[bytes]) -> csv.DictReader:

def __check_csv_manifest(self,
file: IO[bytes],
uuid_field_name: str
uuid_column_name: str
) -> list[Mapping[str, str]]:
reader = self._read_csv_manifest(file)
rows = list(reader)
log.info(f'Manifest contains {len(rows)} rows.')
self.assertGreater(len(rows), 0)
self.assertIn(uuid_field_name, reader.fieldnames)
bundle_uuids = rows[0][uuid_field_name].split(ManifestGenerator.padded_joiner)
self.assertIn(uuid_column_name, reader.fieldnames)
bundle_uuids = rows[0][uuid_column_name].split(ManifestGenerator.padded_joiner)
self.assertGreater(len(bundle_uuids), 0)
for bundle_uuid in bundle_uuids:
self.assertEqual(bundle_uuid, str(uuid.UUID(bundle_uuid)))
Expand Down Expand Up @@ -1605,9 +1613,10 @@ def bundle_uuids(hit: JSON) -> set[str]:
def test_compact_manifest(expected_bundles):
manifest = BytesIO(self._get_url_content(PUT, manifest_url))
manifest_rows = self._read_csv_manifest(manifest)
uuid_column_name = self._uuid_column_name(catalog)
all_found_bundles = set()
for row in manifest_rows:
row_bundles = set(row['bundle_uuid'].split(ManifestGenerator.padded_joiner))
row_bundles = set(row[uuid_column_name].split(ManifestGenerator.padded_joiner))
# It's possible for one file to be present in multiple
# bundles (e.g. due to stitching), so each row may include
# additional bundles besides those included in the filters.
Expand Down
16 changes: 5 additions & 11 deletions test/service/test_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1716,25 +1716,25 @@ def test_compact_manifest(self):
self.assertEqual(200, response.status_code)
expected = [
(
'bundle_uuid',
'bundles.bundle_uuid',
'6b0f6c0f-5d80-a242-accb-840921351cd5',
'826dea02-e274-affe-aabc-eb3db63ad068',
'826dea02-e274-affe-aabc-eb3db63ad068'
),
(
'bundle_version',
'bundles.bundle_version',
'2022-06-01T00:00:00.000000Z',
'2022-06-01T00:00:00.000000Z',
'2022-06-01T00:00:00.000000Z'
),
(
'source_id',
'sources.source_id',
'6c87f0e1-509d-46a4-b845-7584df39263b',
'6c87f0e1-509d-46a4-b845-7584df39263b',
'6c87f0e1-509d-46a4-b845-7584df39263b'
),
(
'source_spec',
'sources.source_spec',
'tdr:bigquery:gcp:test_anvil_project:anvil_snapshot:/2',
'tdr:bigquery:gcp:test_anvil_project:anvil_snapshot:/2',
'tdr:bigquery:gcp:test_anvil_project:anvil_snapshot:/2'
Expand Down Expand Up @@ -1973,12 +1973,6 @@ def test_compact_manifest(self):
'18b3be87-e26b-4376-0d8d-c1e370e90e07',
'a60c5138-3749-f7cb-8714-52d389ad5231'
),
(
'activities.activity_table',
'',
'sequencingactivity',
'sequencingactivity'
),
(
'activities.activity_type',
'',
Expand Down Expand Up @@ -2082,7 +2076,7 @@ def test_compact_manifest(self):
self._drs_uri('v1_6c87f0e1-509d-46a4-b845-7584df39263b_8b722e88-8103-49c1-b351-e64fa7c6ab37')
),
(
'files.file_url',
'files.azul_file_url',
self._file_url('6b0f6c0f-5d80-4242-accb-840921351cd5', self.version),
self._file_url('15b76f9c-6b46-433f-851d-34e89f1b9ba6', self.version),
self._file_url('3b17377b-16b1-431c-9967-e5d01fc5923f', self.version)
Expand Down

0 comments on commit 5ba46d5

Please sign in to comment.