Skip to content

Commit

Permalink
[r] Index orphaned replicas (#6626)
Browse files Browse the repository at this point in the history
  • Loading branch information
nadove-ucsc committed Oct 16, 2024
1 parent c00ca1c commit 71bda1c
Show file tree
Hide file tree
Showing 7 changed files with 489 additions and 146 deletions.
427 changes: 293 additions & 134 deletions src/azul/plugins/repository/tdr_anvil/__init__.py

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions src/azul/terra.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
bigquery,
)
from google.cloud.bigquery import (
DatasetReference,
QueryJob,
QueryJobConfig,
QueryPriority,
Expand Down Expand Up @@ -507,6 +508,14 @@ def run_sql(self, query: str) -> BigQueryRows:
log.debug('Job info: %s', json.dumps(self._job_info(job)))
return result

def list_tables(self, source: TDRSourceSpec) -> set[str]:
bigquery = self._bigquery(self.credentials.project_id)
ref = DatasetReference(project=source.subdomain, dataset_id=source.name)
return {
table.to_api_repr()['tableReference']['tableId']
for table in bigquery.list_tables(ref)
}

def _trunc_query(self, query: str) -> str:
return trunc_ellipses(query, 2048)

Expand Down
25 changes: 25 additions & 0 deletions src/azul/uuids.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,31 @@ def change_version(uuid: str, old_version: int, new_version: int) -> str:
return uuid


def zero_pad(prefix: str, version: int) -> str:
"""
Extend a prefix with zeros to produce a valid UUID.
>>> zero_pad('', 1)
'00000000-0000-1000-8000-000000000000'
>>> zero_pad('abcd', 4)
'abcd0000-0000-4000-8000-000000000000'
>>> zero_pad('f' * 32, 1)
'ffffffff-ffff-1fff-bfff-ffffffffffff'
>>> zero_pad('f' * 33, 1)
Traceback (most recent call last):
...
ValueError: badly formed hexadecimal UUID string
"""
# The intermediary representation is necessary to support non-standard
# versions
temp_version = 1
u = str(UUID(prefix.ljust(32, '0'), version=temp_version))
return change_version(u, temp_version, version)


UUID_PARTITION = TypeVar('UUID_PARTITION', bound='UUIDPartition')


Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

35 changes: 25 additions & 10 deletions test/indexer/test_anvil.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@
from azul.terra import (
TDRClient,
)
from azul.uuids import (
zero_pad,
)
from azul_test_case import (
TDRTestCase,
)
Expand Down Expand Up @@ -105,12 +108,14 @@ def bundle_fqid(cls,
*,
uuid,
version=None,
table_name=BundleType.primary.value
table_name=BundleType.primary.value,
batch_prefix_length=None
) -> TDRAnvilBundleFQID:
assert version is None, 'All AnVIL bundles should use the same version'
return TDRAnvilBundleFQID(source=cls.source,
uuid=uuid,
version=table_name)
version=table_name,
batch_prefix_length=batch_prefix_length)

@classmethod
def primary_bundle(cls) -> TDRAnvilBundleFQID:
Expand All @@ -119,7 +124,8 @@ def primary_bundle(cls) -> TDRAnvilBundleFQID:
@classmethod
def supplementary_bundle(cls) -> TDRAnvilBundleFQID:
return cls.bundle_fqid(uuid='6b0f6c0f-5d80-a242-accb-840921351cd5',
table_name=BundleType.supplementary.value)
table_name=BundleType.supplementary.value,
batch_prefix_length=0)

@classmethod
def duos_bundle(cls) -> TDRAnvilBundleFQID:
Expand All @@ -128,8 +134,9 @@ def duos_bundle(cls) -> TDRAnvilBundleFQID:

@classmethod
def replica_bundle(cls) -> TDRAnvilBundleFQID:
return cls.bundle_fqid(uuid='abc00000-0000-a000-0000-000000000000',
table_name='anvil_activity')
return cls.bundle_fqid(uuid='00000000-0000-a000-0000-000000000000',
table_name='anvil_activity',
batch_prefix_length=0)


class TestAnvilIndexer(AnvilIndexerTestCase,
Expand Down Expand Up @@ -169,13 +176,21 @@ def test_indexing(self):

def test_list_and_fetch_bundles(self):
source_ref = self.source
self._make_mock_tdr_tables(source_ref)
expected_bundle_fqids = sorted([
tables = self._make_mock_tdr_tables(source_ref)
plugin = self.plugin_for_source_spec(source_ref.spec)
unbatched_bundles = [
self.primary_bundle(),
self.supplementary_bundle(),
self.duos_bundle()
])
plugin = self.plugin_for_source_spec(source_ref.spec)
]
batched_bundles = [
self.bundle_fqid(uuid=zero_pad('', plugin.bundle_uuid_version),
table_name=table_name,
batch_prefix_length=self.source.spec.prefix.partition)
for table_name in tables.keys()
if table_name not in (BundleType.primary.value, BundleType.duos.value)
]
self.assertIn(self.supplementary_bundle(), batched_bundles)
expected_bundle_fqids = sorted(unbatched_bundles + batched_bundles)
bundle_fqids = sorted(plugin.list_bundles(source_ref, ''))
self.assertEqual(expected_bundle_fqids, bundle_fqids)
for bundle_fqid in bundle_fqids:
Expand Down
4 changes: 2 additions & 2 deletions test/indexer/test_tdr.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,15 +197,15 @@ def setUpClass(cls):
'--dataset=' + cls.source.spec.name
])

def _make_mock_tdr_tables(self,
source: TDRSourceRef) -> None:
def _make_mock_tdr_tables(self, source: TDRSourceRef) -> JSON:
tables = self._load_canned_file_version(uuid=source.id,
version=None,
extension='tables.tdr')['tables']
for table_name, table_rows in tables.items():
self._make_mock_entity_table(source.spec,
table_name,
table_rows['rows'])
return tables

def _make_mock_entity_table(self,
source: TDRSourceSpec,
Expand Down

0 comments on commit 71bda1c

Please sign in to comment.