Skip to content

Commit

Permalink
[r] Index orphaned replicas (#6626)
Browse files Browse the repository at this point in the history
  • Loading branch information
nadove-ucsc committed Oct 15, 2024
1 parent 90a8b0d commit 0841d85
Show file tree
Hide file tree
Showing 7 changed files with 475 additions and 144 deletions.
1 change: 0 additions & 1 deletion src/azul/plugins/metadata/anvil/indexer/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -569,7 +569,6 @@ def transform(self,
partition: BundlePartition
) -> Iterable[Contribution | Replica]:
yield from super().transform(partition)
dataset_id = self._only_dataset().entity_id
for orphan in self.bundle.orphans:
if partition.contains(UUID(orphan.entity_id)):
yield self._replica(orphan, file_hub=None)
Expand Down
432 changes: 294 additions & 138 deletions src/azul/plugins/repository/tdr_anvil/__init__.py

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions src/azul/terra.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
bigquery,
)
from google.cloud.bigquery import (
DatasetReference,
QueryJob,
QueryJobConfig,
QueryPriority,
Expand Down Expand Up @@ -507,6 +508,14 @@ def run_sql(self, query: str) -> BigQueryRows:
log.debug('Job info: %s', json.dumps(self._job_info(job)))
return result

def list_tables(self, source: TDRSourceSpec) -> set[str]:
    """
    Return the IDs of the tables listed for the BigQuery dataset that
    corresponds to the given source.

    The dataset is addressed using the source's `subdomain` as the project
    and the source's `name` as the dataset ID. The listing is performed
    with a client obtained for the project of this instance's credentials.

    :param source: the source whose dataset should be listed

    :return: the set of `tableId` values found in the table references of
             the listed tables
    """
    client = self._bigquery(self.credentials.project_id)
    dataset = DatasetReference(project=source.subdomain,
                               dataset_id=source.name)
    table_ids = set()
    for item in client.list_tables(dataset):
        # Only the table's own ID is of interest, not the project or
        # dataset it belongs to
        reference = item.to_api_repr()['tableReference']
        table_ids.add(reference['tableId'])
    return table_ids

def _trunc_query(self, query: str) -> str:
    """
    Return the result of passing the given query text to `trunc_ellipses`
    with a limit of 2048.

    :param query: the text of the query to be shortened
    """
    limit = 2048
    return trunc_ellipses(query, limit)

Expand Down
25 changes: 25 additions & 0 deletions src/azul/uuids.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,31 @@ def change_version(uuid: str, old_version: int, new_version: int) -> str:
return uuid


def zero_pad(prefix: str, version: int) -> str:
    """
    Extend a prefix with zeros to produce a valid UUID.

    The prefix is right-padded with zeros to 32 hexadecimal digits. Note
    that the digits holding the UUID version and variant are overwritten,
    so a prefix longer than 12 digits is not preserved verbatim.

    :param prefix: zero to 32 hexadecimal digits

    :param version: the version the resulting UUID should have

    :return: the string representation of the padded UUID

    :raise ValueError: if the padded prefix is not a string of exactly 32
                       hexadecimal digits

    >>> zero_pad('', 1)
    '00000000-0000-1000-8000-000000000000'

    >>> zero_pad('abcd', 4)
    'abcd0000-0000-4000-8000-000000000000'

    >>> zero_pad('f' * 32, 1)
    'ffffffff-ffff-1fff-bfff-ffffffffffff'

    >>> zero_pad('f' * 32, 4)
    'ffffffff-ffff-4fff-bfff-ffffffffffff'

    >>> zero_pad('f' * 33, 1)
    Traceback (most recent call last):
    ...
    ValueError: badly formed hexadecimal UUID string
    """
    # The intermediary representation is necessary to support non-standard
    # versions: the UUID constructor only accepts a limited range of
    # versions, so the UUID is first built with a version it does accept and
    # the requested version is substituted afterwards.
    temp_version = 1
    u = str(UUID(prefix.ljust(32, '0'), version=temp_version))
    return change_version(u, temp_version, version)


UUID_PARTITION = TypeVar('UUID_PARTITION', bound='UUIDPartition')


Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 9 additions & 5 deletions test/indexer/test_anvil.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,13 +105,15 @@ def bundle_fqid(cls,
*,
uuid,
version=None,
table_name=BundleType.primary.value
table_name=BundleType.primary.value,
batch_prefix_length=None
) -> TDRAnvilBundleFQID:
assert version is None, 'All AnVIL bundles should use the same version'
return TDRAnvilBundleFQID(source=cls.source,
uuid=uuid,
version=cls.version,
table_name=table_name)
table_name=table_name,
batch_prefix_length=batch_prefix_length)

@classmethod
def primary_bundle(cls) -> TDRAnvilBundleFQID:
Expand All @@ -120,7 +122,8 @@ def primary_bundle(cls) -> TDRAnvilBundleFQID:
@classmethod
def supplementary_bundle(cls) -> TDRAnvilBundleFQID:
return cls.bundle_fqid(uuid='6b0f6c0f-5d80-a242-accb-840921351cd5',
table_name=BundleType.supplementary.value)
table_name=BundleType.supplementary.value,
batch_prefix_length=0)

@classmethod
def duos_bundle(cls) -> TDRAnvilBundleFQID:
Expand All @@ -129,8 +132,9 @@ def duos_bundle(cls) -> TDRAnvilBundleFQID:

@classmethod
def replica_bundle(cls) -> TDRAnvilBundleFQID:
return cls.bundle_fqid(uuid='abc00000-0000-a000-0000-000000000000',
table_name='anvil_activity')
return cls.bundle_fqid(uuid='00000000-0000-a000-0000-000000000000',
table_name='anvil_activity',
batch_prefix_length=0)


class TestAnvilIndexer(AnvilIndexerTestCase,
Expand Down

0 comments on commit 0841d85

Please sign in to comment.