Skip to content

Commit

Permalink
[a A] Fix: Invalid columns in compact manifest for AnVIL (#6110, PR #…
Browse files Browse the repository at this point in the history
  • Loading branch information
dsotirho-ucsc committed Apr 3, 2024
2 parents a74ed8f + 251945f commit 3ddf36e
Show file tree
Hide file tree
Showing 16 changed files with 841 additions and 418 deletions.
4 changes: 2 additions & 2 deletions lambdas/service/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@
# changes and reset the minor version to zero. Otherwise, increment only
# the minor version for backwards compatible changes. A backwards
# compatible change is one that does not require updates to clients.
'version': '4.1'
'version': '7.2'
},
'tags': [
{
Expand Down Expand Up @@ -631,7 +631,7 @@ def validate_filters(filters):
raise BRE(f'The `filters` parameter entry for `{field}` '
f'must be a single-item dictionary')
else:
if field == app.metadata_plugin.source_id_field:
if field == app.metadata_plugin.special_fields.source_id:
valid_relations = ('is',)
else:
valid_relations = ('is', 'contains', 'within', 'intersects')
Expand Down
2 changes: 1 addition & 1 deletion lambdas/service/openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"info": {
"title": "azul_service",
"description": "\n# Overview\n\nAzul is a REST web service for querying metadata associated with\nboth experimental and analysis data from a data repository. In order\nto deliver response times that make it suitable for interactive use\ncases, the set of metadata properties that it exposes for sorting,\nfiltering, and aggregation is limited. Azul provides a uniform view\nof the metadata over a range of diverse schemas, effectively\nshielding clients from changes in the schemas as they occur over\ntime. It does so, however, at the expense of detail in the set of\nmetadata properties it exposes and in the accuracy with which it\naggregates them.\n\nAzul denormalizes and aggregates metadata into several different\nindices for selected entity types. Metadata entities can be queried\nusing the [Index](#operations-tag-Index) endpoints.\n\nA set of indices forms a catalog. There is a default catalog called\n`dcp2` which will be used unless a\ndifferent catalog name is specified using the `catalog` query\nparameter. Metadata from different catalogs is completely\nindependent: a response obtained by querying one catalog does not\nnecessarily correlate to a response obtained by querying another\none. Two catalogs can contain metadata from the same sources or\ndifferent sources. It is only guaranteed that the body of a\nresponse by any given endpoint adheres to one schema,\nindependently of which catalog was specified in the request.\n\nAzul provides the ability to download data and metadata via the\n[Manifests](#operations-tag-Manifests) endpoints. The\n`curl` format manifests can be used to\ndownload data files. Other formats provide various views of the\nmetadata. Manifests can be generated for a selection of files using\nfilters. 
These filters are interchangeable with the filters used by\nthe [Index](#operations-tag-Index) endpoints.\n\nAzul also provides a [summary](#operations-Index-get_index_summary)\nview of indexed data.\n\n## Data model\n\nAny index, when queried, returns a JSON array of hits. Each hit\nrepresents a metadata entity. Nested in each hit is a summary of the\nproperties of entities associated with the hit. An entity is\nassociated either by a direct edge in the original metadata graph,\nor indirectly as a series of edges. The nested properties are\ngrouped by the type of the associated entity. The properties of all\ndata files associated with a particular sample, for example, are\nlisted under `hits[*].files` in a `/index/samples` response. It is\nimportant to note that while each _hit_ represents a discrete\nentity, the properties nested within that hit are the result of an\naggregation over potentially many associated entities.\n\nTo illustrate this, consider a data file that is part of two\nprojects (a project is a group of related experiments, typically by\none laboratory, institution or consortium). Querying the `files`\nindex for this file yields a hit looking something like:\n\n```\n{\n \"projects\": [\n {\n \"projectTitle\": \"Project One\"\n \"laboratory\": ...,\n ...\n },\n {\n \"projectTitle\": \"Project Two\"\n \"laboratory\": ...,\n ...\n }\n ],\n \"files\": [\n {\n \"format\": \"pdf\",\n \"name\": \"Team description.pdf\",\n ...\n }\n ]\n}\n```\n\nThis example hit contains two kinds of nested entities (a hit in an\nactual Azul response will contain more): There are the two projects\nentities, and the file itself. These nested entities contain\nselected metadata properties extracted in a consistent way. This\nmakes filtering and sorting simple.\n\nAlso notice that there is only one file. When querying a particular\nindex, the corresponding entity will always be a singleton like\nthis.\n",
"version": "4.1"
"version": "7.2"
},
"tags": [
{
Expand Down
18 changes: 12 additions & 6 deletions src/azul/plugins/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@

FieldMapping = Mapping[FieldName, FieldPath]

ColumnMapping = Mapping[FieldPathElement, FieldName]
ColumnMapping = Mapping[FieldPathElement, FieldName | None]
ManifestConfig = Mapping[FieldPath, ColumnMapping]
MutableColumnMapping = dict[FieldPathElement, FieldName]
MutableManifestConfig = dict[FieldPath, MutableColumnMapping]
Expand Down Expand Up @@ -128,6 +128,14 @@ def order(self) -> str:
return 'desc' if self.descending else 'asc'


@attr.s(frozen=True, kw_only=True, auto_attribs=True)
class SpecialFields:
    """
    The plugin-specific names of the source and bundle fields.

    Instances are immutable and every field must be passed by keyword. A
    metadata plugin returns one of these from its `special_fields` property.
    """
    # Name of the field holding a source's ID
    source_id: FieldName
    # Name of the field holding a source's spec
    source_spec: FieldName
    # Name of the field holding a bundle's UUID
    bundle_uuid: FieldName
    # Name of the field holding a bundle's version
    bundle_version: FieldName


class ManifestFormat(Enum):
compact = 'compact'
terra_bdbag = 'terra.bdbag'
Expand Down Expand Up @@ -401,7 +409,7 @@ def _field_mapping(self) -> _FieldMapping:

@property
@abstractmethod
def source_id_field(self) -> str:
def special_fields(self) -> SpecialFields:
raise NotImplementedError

@property
Expand All @@ -418,13 +426,11 @@ def implicit_hub_type(self) -> str:

@property
def facets(self) -> Sequence[str]:
return [
self.source_id_field
]
return [self.special_fields.source_id]

@property
@abstractmethod
def manifest(self) -> ManifestConfig:
def manifest_config(self) -> ManifestConfig:
raise NotImplementedError

@abstractmethod
Expand Down
164 changes: 76 additions & 88 deletions src/azul/plugins/metadata/anvil/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from collections import (
defaultdict,
)
from typing import (
Iterable,
Optional,
Expand All @@ -12,13 +15,15 @@
from azul.indexer.document import (
DocumentType,
EntityType,
FieldPath,
IndexName,
)
from azul.plugins import (
DocumentSlice,
ManifestConfig,
MetadataPlugin,
Sorting,
SpecialFields,
)
from azul.plugins.metadata.anvil.bundle import (
AnvilBundle,
Expand Down Expand Up @@ -59,7 +64,7 @@ def exposed_indices(self) -> dict[EntityType, Sorting]:
return dict(
activities=Sorting(field_name='activities.activity_id'),
biosamples=Sorting(field_name='biosamples.biosample_id'),
bundles=Sorting(field_name='bundleUuid'),
bundles=Sorting(field_name=self.special_fields.bundle_uuid),
datasets=Sorting(field_name='datasets.dataset_id'),
donors=Sorting(field_name='donors.donor_id'),
files=Sorting(field_name='files.file_id'),
Expand Down Expand Up @@ -121,37 +126,35 @@ def _field_mapping(self) -> MetadataPlugin._FieldMapping:
return {
'entity_id': 'entryId',
'bundles': {
'uuid': 'bundleUuid',
'version': 'bundleVersion'
'uuid': self.special_fields.bundle_uuid,
'version': self.special_fields.bundle_version
},
'sources': {
'id': self.source_id_field,
'spec': 'sourceSpec'
'id': self.special_fields.source_id,
'spec': self.special_fields.source_spec
},
'contents': {
'activities': {
f: f'activities.{f}' for f in [
'datasets': {
f: f'datasets.{f}' for f in [
*common_fields,
'activity_id',
'activity_table',
'activity_type',
'assay_type',
'dataset_id',
'consent_group',
'data_use_permission',
'owner',
'principal_investigator',
'registered_identifier',
'title',
'data_modality',
'reference_assembly',
# Not in schema
'date_created',
]
},
'biosamples': {
f: f'biosamples.{f}' for f in [
'donors': {
f: f'donors.{f}' for f in [
*common_fields,
'biosample_id',
'anatomical_site',
'apriori_cell_type',
'biosample_type',
'disease',
'donor_age_at_collection_unit',
'donor_age_at_collection',
'donor_id',
'organism_type',
'phenotypic_sex',
'reported_ethnicity',
'genetic_ancestry',
]
},
'diagnoses': {
Expand All @@ -167,27 +170,29 @@ def _field_mapping(self) -> MetadataPlugin._FieldMapping:
'phenopacket'
]
},
'datasets': {
f: f'datasets.{f}' for f in [
'biosamples': {
f: f'biosamples.{f}' for f in [
*common_fields,
'dataset_id',
'consent_group',
'data_use_permission',
'owner',
'principal_investigator',
'registered_identifier',
'title',
'data_modality',
'biosample_id',
'anatomical_site',
'apriori_cell_type',
'biosample_type',
'disease',
'donor_age_at_collection_unit',
'donor_age_at_collection',
]
},
'donors': {
f: f'donors.{f}' for f in [
'activities': {
f: f'activities.{f}' for f in [
*common_fields,
'donor_id',
'organism_type',
'phenotypic_sex',
'reported_ethnicity',
'genetic_ancestry',
'activity_id',
'activity_table',
'activity_type',
'assay_type',
'data_modality',
'reference_assembly',
# Not in schema
'date_created',
]
},
'files': {
Expand Down Expand Up @@ -220,8 +225,11 @@ def _field_mapping(self) -> MetadataPlugin._FieldMapping:
}

@property
def source_id_field(self) -> str:
return 'sourceId'
def special_fields(self) -> SpecialFields:
    """
    Return the names this plugin uses for the source and bundle fields.

    For this plugin, the name of every special field is identical to the
    name of the corresponding `SpecialFields` attribute.
    """
    attributes = (
        'source_id',
        'source_spec',
        'bundle_uuid',
        'bundle_version',
    )
    # Each attribute is assigned its own name as the field name
    return SpecialFields(**{attribute: attribute for attribute in attributes})

@property
def implicit_hub_type(self) -> str:
Expand Down Expand Up @@ -254,52 +262,32 @@ def facets(self) -> Sequence[str]:
]

@property
def manifest(self) -> ManifestConfig:
return {
('sources',): {
'id': 'source_id',
'spec': 'source_spec',
},
('bundles',): {
'uuid': 'bundle_uuid',
'version': 'bundle_version'
},
('contents', 'activities'): {
'document_id': 'activity_document_id',
'activity_type': 'activity_type',
},
('contents', 'biosamples'): {
'document_id': 'biosample_document_id',
'biosample_type': 'biosample_type',
'anatomical_site': 'anatomical_site'
},
('contents', 'datasets'): {
'document_id': 'dataset_document_id',
'dataset_id': 'dataset_id',
'title': 'dataset_title'
},
('contents', 'donors'): {
'phenotypic_sex': 'phenotypic_sex',
'document_id': 'donor_document_id',
'species': 'species',
},
('contents', 'files'): {
'document_id': 'file_document_id',
'name': 'file_name',
'file_format': 'file_format',
'size': 'file_size',
'uuid': 'file_uuid',
'version': 'file_version',
'reference_assembly': 'file_reference_assembly',
'is_supplementary': 'file_is_supplementary',
'data_modality': 'file_data_modality',
'crc32': 'file_crc32',
'sha256': 'file_sha256',
'file_md5sum': 'file_md5',
'drs_uri': 'file_drs_uri',
'file_url': 'file_url'
}
}
def manifest_config(self) -> ManifestConfig:
    """
    Derive the manifest configuration from this plugin's field mapping.

    The nested field mapping is walked depth-first. Every leaf that maps to
    a string contributes one entry to the result: the key of the outer
    dictionary is the tuple of mapping keys leading to the leaf's parent,
    the key of the inner dictionary is the leaf's own key, and the value is
    the string the leaf maps to. There are three exceptions:

    - the top-level `entity_id` entry is left out entirely,

    - the `uuid` entry under `contents` / `files` is mapped to `None` and
      an additional `file_url` entry mapped to `files.file_url` is added
      next to it,

    - the `version` entry under `contents` / `files` is mapped to `None`.

    A mapping value that is neither a dictionary nor a string fails an
    assertion.
    """
    file_uuid_path = ('contents', 'files', 'uuid')
    file_version_path = ('contents', 'files', 'version')
    config = defaultdict(dict)

    def visit(fields: MetadataPlugin._FieldMapping, parent: FieldPath):
        for key, value in fields.items():
            here = parent + (key,)
            if isinstance(value, dict):
                visit(value, here)
            elif not isinstance(value, str):
                assert False, (parent, key, value)
            elif here == ('entity_id',):
                # The entity ID does not get a column
                continue
            elif here == file_uuid_path:
                # Ask for a file URL to be injected. The UUID is only
                # needed for that injection, so its own column is
                # suppressed by mapping it to None.
                config[parent]['file_url'] = 'files.file_url'
                config[parent][key] = None
            elif here == file_version_path:
                # Like the UUID, the version is only needed for the
                # injection of the file URL.
                config[parent][key] = None
            else:
                config[parent][key] = value

    visit(self._field_mapping, ())
    return config

def document_slice(self, entity_type: str) -> Optional[DocumentSlice]:
    """
    Return the document slice to use for the given entity type.

    This plugin returns `None` regardless of the entity type.
    """
    return None
Expand Down
15 changes: 11 additions & 4 deletions src/azul/plugins/metadata/anvil/service/response.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
from azul.json import (
copy_json,
)
from azul.plugins import (
SpecialFields,
)
from azul.service.elasticsearch_service import (
ResponseTriple,
)
Expand Down Expand Up @@ -143,14 +146,18 @@ def _make_hit(self, es_hit: JSON) -> MutableJSON:

def _make_source(self, es_source: JSON) -> MutableJSON:
return {
'sourceSpec': es_source['spec'],
'sourceId': es_source['id']
self._special_fields.source_spec: es_source['spec'],
self._special_fields.source_id: es_source['id']
}

@cached_property
def _special_fields(self) -> SpecialFields:
    """
    The special field names of this stage's plugin, cached after the first
    access.
    """
    plugin = self.plugin
    return plugin.special_fields

def _make_bundle(self, es_bundle: JSON) -> MutableJSON:
return {
'bundleUuid': es_bundle['uuid'],
'bundleVersion': es_bundle['version']
self._special_fields.bundle_uuid: es_bundle['uuid'],
self._special_fields.bundle_version: es_bundle['version']
}

def _make_contents(self, es_contents: JSON) -> MutableJSON:
Expand Down
Loading

0 comments on commit 3ddf36e

Please sign in to comment.