Skip to content

Commit

Permalink
[r] Emit replicas for AnVIL entities not linked to files
Browse files Browse the repository at this point in the history
  • Loading branch information
nadove-ucsc committed Nov 7, 2024
1 parent d9280cc commit c8a084f
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 10 deletions.
23 changes: 16 additions & 7 deletions src/azul/plugins/metadata/anvil/indexer/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -564,6 +564,22 @@ def _singleton(self) -> EntityReference:
return EntityReference(entity_type='bundle',
entity_id=self.bundle.uuid)

def transform(self,
              partition: BundlePartition
              ) -> Iterable[Contribution | Replica]:
    """
    Yield everything the parent transformer produces for the given
    partition, followed — when replicas are enabled — by one replica
    (without a file hub) for every bundle entity whose ID lies within
    that partition.
    """
    yield from super().transform(partition)
    if not config.enable_replicas:
        return
    # The file transformer only emits replicas for entities linked to at
    # least one file. A small number of linked entities are therefore
    # missed, typically those from primary bundles that contain no files.
    # Emitting a replica for every entity here closes that gap. Some of
    # these replicas duplicate ones emitted by the file transformer, but
    # the index service coalesces them before writing to Elasticsearch.
    for candidate in self.bundle.entities:
        candidate_uuid = UUID(candidate.entity_id)
        if partition.contains(candidate_uuid):
            yield self._replica(candidate, file_hub=None)


class DatasetTransformer(SingletonTransformer):

Expand All @@ -574,13 +590,6 @@ def entity_type(cls) -> str:
def _singleton(self) -> EntityReference:
return self._only_dataset()

def _transform(self,
               entity: EntityReference
               ) -> Iterable[Contribution | Replica]:
    """
    Yield the parent transformer's output for the given entity and then,
    if `_is_duos` reports true for that entity, one additional replica
    created without a file hub.
    """
    yield from super()._transform(entity)
    needs_replica = self._is_duos(entity)
    if needs_replica:
        yield self._replica(entity, file_hub=None)


class DonorTransformer(BaseTransformer):

Expand Down
4 changes: 1 addition & 3 deletions test/indexer/test_anvil.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,5 @@ def test_dataset_description(self):
self.assertDictEqual(doc_counts, {
DocumentType.aggregate: 1,
DocumentType.contribution: 2,
# No replica is emitted for the primary dataset because we dropped
# the files (hubs) from its bundle above
**({DocumentType.replica: 1} if config.enable_replicas else {})
**({DocumentType.replica: 2} if config.enable_replicas else {})
})

0 comments on commit c8a084f

Please sign in to comment.