Skip to content

Commit

Permalink
Derive manifest config for AnVIL from field mapping (#6110)
Browse files Browse the repository at this point in the history
  • Loading branch information
hannes-ucsc committed Mar 31, 2024
1 parent e10f365 commit 8239bce
Show file tree
Hide file tree
Showing 6 changed files with 316 additions and 171 deletions.
4 changes: 2 additions & 2 deletions src/azul/plugins/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
EntityType,
FieldPath,
FieldPathElement,
FieldTypes,
IndexName,
)
from azul.indexer.transform import (
Expand Down Expand Up @@ -409,9 +410,8 @@ def facets(self) -> Sequence[str]:
self.source_id_field
]

@property
@abstractmethod
def manifest(self) -> ManifestConfig:
def manifest_config(self, field_types: FieldTypes) -> ManifestConfig:
raise NotImplementedError

@abstractmethod
Expand Down
82 changes: 35 additions & 47 deletions src/azul/plugins/metadata/anvil/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from collections import (
defaultdict,
)
from typing import (
Iterable,
Optional,
Expand All @@ -8,6 +11,9 @@
from azul.indexer.document import (
DocumentType,
EntityType,
FieldPath,
FieldType,
FieldTypes,
IndexName,
)
from azul.plugins import (
Expand Down Expand Up @@ -241,53 +247,35 @@ def facets(self) -> Sequence[str]:
'files.is_supplementary',
]

@property
def manifest(self) -> ManifestConfig:
    """
    The statically defined manifest config of this plugin.

    The result maps a path into a document (a tuple of property names) to a
    dictionary that pairs each field found under that path with the name
    used for that field in the manifest. The insertion order of both levels
    is part of the result.
    """
    sources = {
        'id': 'source_id',
        'spec': 'source_spec',
    }
    bundles = {
        'uuid': 'bundle_uuid',
        'version': 'bundle_version',
    }
    activities = {
        'document_id': 'activity_document_id',
        'activity_type': 'activity_type',
    }
    biosamples = {
        'document_id': 'biosample_document_id',
        'biosample_type': 'biosample_type',
        'anatomical_site': 'anatomical_site',
    }
    datasets = {
        'document_id': 'dataset_document_id',
        'dataset_id': 'dataset_id',
        'title': 'dataset_title',
    }
    donors = {
        'phenotypic_sex': 'phenotypic_sex',
        'document_id': 'donor_document_id',
        'species': 'species',
    }
    files = {
        'document_id': 'file_document_id',
        'name': 'file_name',
        'file_format': 'file_format',
        'size': 'file_size',
        'uuid': 'file_uuid',
        'version': 'file_version',
        'reference_assembly': 'file_reference_assembly',
        'is_supplementary': 'file_is_supplementary',
        'data_modality': 'file_data_modality',
        'crc32': 'file_crc32',
        'sha256': 'file_sha256',
        'file_md5sum': 'file_md5',
        'drs_uri': 'file_drs_uri',
        'file_url': 'file_url',
    }
    # Top-level entities first, then the entities nested under `contents`
    return {
        ('sources',): sources,
        ('bundles',): bundles,
        ('contents', 'activities'): activities,
        ('contents', 'biosamples'): biosamples,
        ('contents', 'datasets'): datasets,
        ('contents', 'donors'): donors,
        ('contents', 'files'): files,
    }
def manifest_config(self, field_types: FieldTypes) -> ManifestConfig:
    """
    Derive the manifest config from the given field types and this plugin's
    field mapping (``self._field_mapping``).

    The field types are walked recursively. Every leaf (a ``FieldType`` or
    a list) that sits below at least one level of nesting is looked up in
    the field mapping by following the same path. Leaves whose path is
    absent from the field mapping are left out of the result.

    :param field_types: a nested dictionary whose leaves are ``FieldType``
                        instances or lists

    :return: a plain dictionary mapping the path of each parent to a
             dictionary that maps the name of each leaf under that parent
             to the value found in the field mapping, subject to the
             overrides for file UUID and version noted below

    :raise AssertionError: if a node in ``field_types`` is neither a
                           dictionary, a list nor a ``FieldType``
    """
    result = defaultdict(dict)
    mapping = self._field_mapping

    def recurse(types: FieldTypes, path: FieldPath):
        for path_element, type_or_types in types.items():
            new_path = (*path, path_element)
            if isinstance(type_or_types, dict):
                recurse(type_or_types, new_path)
            elif isinstance(type_or_types, list) or isinstance(type_or_types, FieldType):
                if path:  # only nested fields
                    # Follow the same path through the field mapping
                    field_name = mapping
                    try:
                        for i in new_path:
                            field_name = field_name[i]
                    except KeyError:
                        # The field mapping has no entry for this path
                        pass
                    else:
                        # The file UUID and version get fixed names instead
                        # of the ones from the field mapping. The UUID also
                        # triggers the addition of an entry for the file
                        # URL, inserted before the entry for the UUID.
                        if new_path == ('contents', 'files', 'uuid'):
                            result[path]['file_url'] = 'files.file_url'
                            field_name = 'files.uuid'
                        elif new_path == ('contents', 'files', 'version'):
                            field_name = 'files.version'
                        result[path][path_element] = field_name
            else:
                # Raised explicitly, instead of via `assert False`, so that
                # the check is not stripped when Python runs with -O
                raise AssertionError((path, path_element, type_or_types))

    recurse(field_types, ())
    # Return a plain dict so that a caller subscripting the config with an
    # unknown path gets a KeyError instead of silently inserting an entry
    return dict(result)

def document_slice(self, entity_type: str) -> Optional[DocumentSlice]:
return None
Expand Down
4 changes: 2 additions & 2 deletions src/azul/plugins/metadata/hca/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
Aggregate,
DocumentType,
EntityType,
FieldTypes,
IndexName,
)
from azul.plugins import (
Expand Down Expand Up @@ -306,8 +307,7 @@ def facets(self) -> Sequence[str]:
'bionetworkName'
]

@property
def manifest(self) -> ManifestConfig:
def manifest_config(self, field_types: FieldTypes) -> ManifestConfig:
return {
('sources',): {
'id': 'source_id',
Expand Down
3 changes: 2 additions & 1 deletion src/azul/service/manifest_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -869,7 +869,8 @@ def manifest_config(self) -> ManifestConfig:
The manifest config this generator uses. A manifest config is a mapping
from document properties to manifest fields.
"""
return self.service.metadata_plugin(self.catalog).manifest
plugin = self.service.metadata_plugin(self.catalog)
return plugin.manifest_config(self._field_types)

@cached_property
def included_fields(self) -> list[FieldPath] | None:
Expand Down
Loading

0 comments on commit 8239bce

Please sign in to comment.