Skip to content

Commit

Permalink
[u r] Add type and domain fields to TDRSourceSpec (#6426)
Browse files Browse the repository at this point in the history
  • Loading branch information
nadove-ucsc committed Jul 27, 2024
1 parent e358cdf commit 4589e62
Show file tree
Hide file tree
Showing 15 changed files with 95 additions and 42 deletions.
3 changes: 2 additions & 1 deletion UPGRADING.rst
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ have too many entries in this file.
#6426 Clean-up and generalize TDR source specs
==============================================

The "snapshot/" string has been removed from TDR source specs.
The "snapshot/" string has been removed from TDR source specs, and the ``type``
and ``domain`` fields have been added.
Update the ``mksrc`` function in ``environment.py`` for each of your personal
deployments. As always, use the sandbox deployment's ``environment.py`` as a
model when upgrading personal deployments.
Expand Down
2 changes: 2 additions & 0 deletions deployments/anvilbox/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ def mksrc(google_project,
prefix = common_prefix(subgraphs)
source = None if flags & pop else ':'.join([
'tdr',
'bigquery',
'gcp',
google_project,
snapshot,
prefix + '/0'
Expand Down
2 changes: 2 additions & 0 deletions deployments/anvildev/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ def mksrc(google_project, snapshot, subgraphs, flags: int = 0) -> tuple[str, str
assert flags <= ma | pop
source = None if flags & pop else ':'.join([
'tdr',
'bigquery',
'gcp',
google_project,
snapshot,
'/' + str(partition_prefix_length(subgraphs))
Expand Down
2 changes: 2 additions & 0 deletions deployments/anvilprod/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ def mksrc(google_project, snapshot, subgraphs, flags: int = 0) -> tuple[str, str
assert flags <= ma | pop
source = None if flags & pop else ':'.join([
'tdr',
'bigquery',
'gcp',
google_project,
snapshot,
'/' + str(partition_prefix_length(subgraphs))
Expand Down
2 changes: 2 additions & 0 deletions deployments/dev/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ def mksrc(google_project, snapshot, subgraphs, flags: int = 0) -> tuple[str, str
assert flags <= ma | pop
source = None if flags & pop else ':'.join([
'tdr',
'bigquery',
'gcp',
google_project,
snapshot,
'/' + str(partition_prefix_length(subgraphs))
Expand Down
2 changes: 2 additions & 0 deletions deployments/hammerbox/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ def mksrc(google_project,
prefix = common_prefix(subgraphs)
source = None if flags & pop else ':'.join([
'tdr',
'bigquery',
'gcp',
google_project,
snapshot,
prefix + '/0'
Expand Down
2 changes: 2 additions & 0 deletions deployments/prod/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ def mksrc(google_project, snapshot, subgraphs, flags: int = 0) -> tuple[str, str
assert flags <= ma | pop
source = None if flags & pop else ':'.join([
'tdr',
'bigquery',
'gcp',
google_project,
snapshot,
'/' + str(partition_prefix_length(subgraphs))
Expand Down
2 changes: 2 additions & 0 deletions deployments/sandbox/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ def mksrc(google_project,
prefix = common_prefix(subgraphs)
source = None if flags & pop else ':'.join([
'tdr',
'bigquery',
'gcp',
google_project,
snapshot,
prefix + '/0'
Expand Down
2 changes: 2 additions & 0 deletions deployments/tempdev/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ def mksrc(google_project, snapshot, subgraphs, flags: int = 0) -> tuple[str, str
assert flags <= ma | pop
source = None if flags & pop else ':'.join([
'tdr',
'bigquery',
'gcp',
google_project,
snapshot,
'/' + str(partition_prefix_length(subgraphs))
Expand Down
58 changes: 47 additions & 11 deletions src/azul/terra.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from collections.abc import (
Sequence,
)
import enum
import json
import logging
from time import (
Expand Down Expand Up @@ -52,6 +53,7 @@
cache,
config,
mutable_furl,
reject,
require,
)
from azul.auth import (
Expand Down Expand Up @@ -91,37 +93,67 @@
log = logging.getLogger(__name__)


@enum.unique
class SourceType(enum.Enum):
    """
    The closed set of values accepted for the ``type`` component of a TDR
    source spec string. Each member's value is the literal text that appears
    in the spec.
    """
    bigquery = 'bigquery'
    parquet = 'parquet'


@enum.unique
class SourceDomain(enum.Enum):
    """
    The closed set of values accepted for the ``domain`` component of a TDR
    source spec string. Each member's value is the literal text that appears
    in the spec.
    """
    gcp = 'gcp'
    azure = 'azure'


@attrs.frozen(kw_only=True)
class TDRSourceSpec(SourceSpec):
type: SourceType
domain: SourceDomain
subdomain: str
name: str

@classmethod
def parse(cls, spec: str) -> 'TDRSourceSpec':
"""
Construct an instance from its string representation, using the syntax
'tdr:{subdomain}:{name}:{prefix}' ending with an optional
'tdr:{type}:{domain}:{subdomain}:{name}:{prefix}' ending with an optional
'/{partition_prefix_length}'.
>>> s = TDRSourceSpec.parse('tdr:foo:bar:/0')
>>> s = TDRSourceSpec.parse('tdr:bigquery:gcp:foo:bar:/0')
>>> s # doctest: +NORMALIZE_WHITESPACE
TDRSourceSpec(prefix=Prefix(common='', partition=0),
type=<SourceType.bigquery: 'bigquery'>,
domain=<SourceDomain.gcp: 'gcp'>,
subdomain='foo',
name='bar')
>>> str(s)
'tdr:foo:bar:/0'
'tdr:bigquery:gcp:foo:bar:/0'
>>> TDRSourceSpec.parse('tdr:spam:gcp:foo:bar:/0')
Traceback (most recent call last):
...
ValueError: 'spam' is not a valid SourceType
>>> TDRSourceSpec.parse('tdr:bigquery:eggs:foo:bar:/0')
Traceback (most recent call last):
...
ValueError: 'eggs' is not a valid SourceDomain
>>> TDRSourceSpec.parse('tdr:foo:bar:n32/0')
>>> TDRSourceSpec.parse('tdr:bigquery:gcp:foo:bar:n32/0')
Traceback (most recent call last):
...
azul.uuids.InvalidUUIDPrefixError: 'n32' is not a valid UUID prefix.
"""
rest, prefix = cls._parse(spec)
# BigQuery (and by extension the TDR) does not allow : or / in dataset names
service, subdomain, name = rest.split(':')
service, type, domain, subdomain, name = rest.split(':')
assert service == 'tdr', service
type = SourceType(type)
reject(type == SourceType.parquet, 'Parquet sources are not yet supported')
domain = SourceDomain(domain)
reject(domain == SourceDomain.azure, 'Azure sources are not yet supported')
self = cls(prefix=prefix,
type=SourceType(type),
domain=SourceDomain(domain),
subdomain=subdomain,
name=name)
assert spec == str(self), spec
Expand All @@ -131,20 +163,22 @@ def __str__(self) -> str:
"""
The inverse of :meth:`parse`.
>>> s = 'tdr:foo:bar:/0'
>>> s = 'tdr:bigquery:gcp:foo:bar:/0'
>>> s == str(TDRSourceSpec.parse(s))
True
>>> s = 'tdr:foo:bar:22/0'
>>> s = 'tdr:bigquery:gcp:foo:bar:22/0'
>>> s == str(TDRSourceSpec.parse(s))
True
>>> s = 'tdr:foo:bar:22/2'
>>> s = 'tdr:bigquery:gcp:foo:bar:22/2'
>>> s == str(TDRSourceSpec.parse(s))
True
"""
return ':'.join([
'tdr',
self.type.value,
self.domain.value,
self.subdomain,
self.name,
str(self.prefix)
Expand All @@ -157,18 +191,20 @@ def contains(self, other: 'SourceSpec') -> bool:
"""
>>> p = TDRSourceSpec.parse
>>> p('tdr:foo:bar:/0').contains(p('tdr:foo:bar:/0'))
>>> p('tdr:bigquery:gcp:foo:bar:/0').contains(p('tdr:bigquery:gcp:foo:bar:/0'))
True
>>> p('tdr:foo:bar:/0').contains(p('tdr:bar:bar:/0'))
>>> p('tdr:bigquery:gcp:foo:bar:/0').contains(p('tdr:bigquery:gcp:bar:bar:/0'))
False
>>> p('tdr:foo:bar:/0').contains(p('tdr:foo:baz:/0'))
>>> p('tdr:bigquery:gcp:foo:bar:/0').contains(p('tdr:bigquery:gcp:foo:baz:/0'))
False
"""
return (
isinstance(other, TDRSourceSpec)
and super().contains(other)
and self.type == other.type
and self.domain == other.domain
and self.subdomain == other.subdomain
and self.name == other.name
)
Expand Down
4 changes: 2 additions & 2 deletions test/azul_test_case.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,7 +502,7 @@ def _patch_source_cache(cls):

class DCP2TestCase(TDRTestCase):
source = TDRSourceRef(id='d8c20944-739f-4e7d-9161-b720953432ce',
spec=TDRSourceSpec.parse('tdr:test_hca_project:hca_snapshot:/2'))
spec=TDRSourceSpec.parse('tdr:bigquery:gcp:test_hca_project:hca_snapshot:/2'))

@classmethod
def catalog_config(cls) -> dict[CatalogName, config.Catalog]:
Expand All @@ -518,7 +518,7 @@ def catalog_config(cls) -> dict[CatalogName, config.Catalog]:

class AnvilTestCase(TDRTestCase):
source = TDRSourceRef(id='6c87f0e1-509d-46a4-b845-7584df39263b',
spec=TDRSourceSpec.parse('tdr:test_anvil_project:anvil_snapshot:/2'))
spec=TDRSourceSpec.parse('tdr:bigquery:gcp:test_anvil_project:anvil_snapshot:/2'))

@classmethod
def catalog_config(cls) -> dict[CatalogName, config.Catalog]:
Expand Down
32 changes: 16 additions & 16 deletions test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions test/service/test_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1680,9 +1680,9 @@ def test_compact_manifest(self):
),
(
'source_spec',
'tdr:test_anvil_project:anvil_snapshot:/2',
'tdr:test_anvil_project:anvil_snapshot:/2',
'tdr:test_anvil_project:anvil_snapshot:/2'
'tdr:bigquery:gcp:test_anvil_project:anvil_snapshot:/2',
'tdr:bigquery:gcp:test_anvil_project:anvil_snapshot:/2',
'tdr:bigquery:gcp:test_anvil_project:anvil_snapshot:/2'
),
(
'datasets.document_id',
Expand Down
Loading

0 comments on commit 4589e62

Please sign in to comment.