diff --git a/scripts/post_deploy_tdr.py b/scripts/post_deploy_tdr.py index 98c680f31..40411592f 100644 --- a/scripts/post_deploy_tdr.py +++ b/scripts/post_deploy_tdr.py @@ -102,7 +102,7 @@ def verify_source(self, require(prefix.common == '', source_spec) self.tdr.check_bigquery_access(source_spec) else: - subgraph_count = len(plugin.list_bundles(ref, prefix.common)) + subgraph_count = plugin.count_bundles(ref.spec) require(subgraph_count > 0, 'Common prefix is too long', ref.spec) require(subgraph_count <= 512, 'Common prefix is too short', ref.spec) diff --git a/src/azul/plugins/__init__.py b/src/azul/plugins/__init__.py index 893499fb2..c7dffb65e 100644 --- a/src/azul/plugins/__init__.py +++ b/src/azul/plugins/__init__.py @@ -615,9 +615,10 @@ def _lookup_source_id(self, spec: SOURCE_SPEC) -> str: raise NotImplementedError @abstractmethod - def _count_subgraphs(self, source: SOURCE_SPEC) -> int: + def count_bundles(self, source: SOURCE_SPEC) -> int: """ - The total number of subgraphs in the given source, ignoring its prefix. + The total number of subgraphs in the given source. The source's prefix + may be None. """ raise NotImplementedError @@ -631,7 +632,7 @@ def partition_source(self, should be appropriate for indexing in the given catalog. """ if source.spec.prefix is None: - count = self._count_subgraphs(source.spec) + count = self.count_bundles(source.spec) is_main = config.deployment.is_main is_it = catalog in config.integration_test_catalogs # We use the "lesser" heuristic during IT to avoid indexing an diff --git a/src/azul/plugins/repository/canned/__init__.py b/src/azul/plugins/repository/canned/__init__.py index 3c2705fdf..ea4c35f9e 100644 --- a/src/azul/plugins/repository/canned/__init__.py +++ b/src/azul/plugins/repository/canned/__init__.py @@ -26,6 +26,9 @@ from furl import ( furl, ) +from more_itertools import ( + ilen, +) from azul import ( CatalogName, @@ -163,9 +166,13 @@ def staging_area(self, url: str) -> StagingArea: ref) return factory.load_staging_area(path) - def _count_subgraphs(self, source: SOURCE_SPEC) -> int: + def count_bundles(self, source: SOURCE_SPEC) -> int: staging_area = self.staging_area(source.spec.name) - return len(staging_area.links) + return ilen( + links_id + for links_id in staging_area.links + if source.prefix is None or links_id.startswith(source.prefix.common) + ) def list_bundles(self, source: CannedSourceRef, diff --git a/src/azul/plugins/repository/dss/__init__.py b/src/azul/plugins/repository/dss/__init__.py index 96a6da973..4e5809f4a 100644 --- a/src/azul/plugins/repository/dss/__init__.py +++ b/src/azul/plugins/repository/dss/__init__.py @@ -118,7 +118,7 @@ def sources(self) -> AbstractSet[SimpleSourceSpec]: def _lookup_source_id(self, spec: SimpleSourceSpec) -> str: return DSSSourceRef.id_from_spec(spec) - def _count_subgraphs(self, source: SimpleSourceSpec) -> NoReturn: + def count_bundles(self, source: SimpleSourceSpec) -> NoReturn: assert False, 'DSS is EOL' def list_sources(self, diff --git a/src/azul/plugins/repository/tdr_anvil/__init__.py b/src/azul/plugins/repository/tdr_anvil/__init__.py index 3d7afc2b9..13449cd52 100644 --- a/src/azul/plugins/repository/tdr_anvil/__init__.py +++ b/src/azul/plugins/repository/tdr_anvil/__init__.py @@ -169,14 +169,16 @@ def _version(self): datarepo_row_uuid_version = 4 bundle_uuid_version = 10 - def _count_subgraphs(self, source: TDRSourceSpec) -> int: + def count_bundles(self, source: TDRSourceSpec) -> int: + prefix = '' if source.prefix is None else source.prefix.common rows = self._run_sql(f''' SELECT COUNT(*) AS count FROM {backtick(self._full_table_name(source, BundleType.primary.value))} + WHERE STARTS_WITH(datarepo_row_id, {prefix!r}) UNION ALL SELECT COUNT(*) AS count FROM {backtick(self._full_table_name(source, BundleType.supplementary.value))} - WHERE is_supplementary + WHERE is_supplementary AND STARTS_WITH(datarepo_row_id, {prefix!r}) ''') return sum(row['count'] for row in rows) diff --git a/src/azul/plugins/repository/tdr_hca/__init__.py b/src/azul/plugins/repository/tdr_hca/__init__.py index 320b0a4f1..2f10c564b 100644 --- a/src/azul/plugins/repository/tdr_hca/__init__.py +++ b/src/azul/plugins/repository/tdr_hca/__init__.py @@ -278,11 +278,14 @@ def _parse_drs_uri(self, class Plugin(TDRPlugin[TDRHCABundle, TDRSourceSpec, TDRSourceRef, TDRBundleFQID]): - def _count_subgraphs(self, source: TDRSourceSpec) -> int: - rows = self._run_sql(f''' - SELECT COUNT(*) AS count - FROM {backtick(self._full_table_name(source, 'links'))} - ''') + def count_bundles(self, source: TDRSourceSpec) -> int: + prefix = '' if source.prefix is None else source.prefix.common + query = f''' + SELECT COUNT(*) AS count + FROM {backtick(self._full_table_name(source, 'links'))} + WHERE STARTS_WITH(datarepo_row_id, {prefix!r}) + ''' + rows = self._run_sql(query) return one(rows)['count'] def _list_bundles(self,