From 07c5c12b5a1ca3a8e3aaecebcb493e4ad36e8f7a Mon Sep 17 00:00:00 2001 From: Jan Heinrich Reimer Date: Mon, 20 Nov 2023 17:00:39 +0100 Subject: [PATCH] Simplify doc counts --- archive_query_log/captures/__init__.py | 4 +--- archive_query_log/downloaders/warc.py | 4 +--- archive_query_log/monitoring/home.py | 4 +--- archive_query_log/parsers/url_offset.py | 4 +--- archive_query_log/parsers/url_page.py | 4 +--- archive_query_log/parsers/url_query.py | 4 +--- archive_query_log/parsers/warc_query.py | 7 +++---- archive_query_log/sources/__init__.py | 21 ++++----------------- 8 files changed, 13 insertions(+), 39 deletions(-) diff --git a/archive_query_log/captures/__init__.py b/archive_query_log/captures/__init__.py index b47b3477..a6ba2887 100644 --- a/archive_query_log/captures/__init__.py +++ b/archive_query_log/captures/__init__.py @@ -103,9 +103,7 @@ def fetch_captures(config: Config) -> None: ) .query(FunctionScore(functions=[RandomScore()])) ) - num_changed_sources = ( - changed_sources_search.extra(track_total_hits=True) - .execute().hits.total.value) + num_changed_sources = changed_sources_search.count() if num_changed_sources > 0: echo(f"Fetching captures for {num_changed_sources} " f"new/changed sources.") diff --git a/archive_query_log/downloaders/warc.py b/archive_query_log/downloaders/warc.py index 42a34ef9..4f78f26c 100644 --- a/archive_query_log/downloaders/warc.py +++ b/archive_query_log/downloaders/warc.py @@ -106,9 +106,7 @@ def download_serps_warc(config: Config) -> None: ) .query(FunctionScore(functions=[RandomScore()])) ) - num_changed_serps = ( - changed_serps_search.extra(track_total_hits=True) - .execute().hits.total.value) + num_changed_serps = changed_serps_search.count() if num_changed_serps <= 0: echo("No new/changed captures.") diff --git a/archive_query_log/monitoring/home.py b/archive_query_log/monitoring/home.py index 24d9fab9..8ec40dc8 100644 --- a/archive_query_log/monitoring/home.py +++ b/archive_query_log/monitoring/home.py @@ -108,9 +108,7 @@ def _get_processed_progress( f"doc['{timestamp_field}'].value)", ) ) - total_processed = (search_processed.extra(track_total_hits=True).execute() - .hits.total.value) - + total_processed = search_processed.count() progress = Progress( name=name, description=description, diff --git a/archive_query_log/parsers/url_offset.py b/archive_query_log/parsers/url_offset.py index 57091b33..df6ecafb 100644 --- a/archive_query_log/parsers/url_offset.py +++ b/archive_query_log/parsers/url_offset.py @@ -178,9 +178,7 @@ def parse_serps_url_offset(config: Config) -> None: ) .query(FunctionScore(functions=[RandomScore()])) ) - num_changed_serps = ( - changed_serps_search.extra(track_total_hits=True) - .execute().hits.total.value) + num_changed_serps = changed_serps_search.count() if num_changed_serps > 0: changed_serps: Iterable[Serp] = changed_serps_search.scan() changed_serps = safe_iter_scan(changed_serps) diff --git a/archive_query_log/parsers/url_page.py b/archive_query_log/parsers/url_page.py index 6963bc22..737197fb 100644 --- a/archive_query_log/parsers/url_page.py +++ b/archive_query_log/parsers/url_page.py @@ -178,9 +178,7 @@ def parse_serps_url_page(config: Config) -> None: ) .query(FunctionScore(functions=[RandomScore()])) ) - num_changed_serps = ( - changed_serps_search.extra(track_total_hits=True) - .execute().hits.total.value) + num_changed_serps = changed_serps_search.count() if num_changed_serps > 0: changed_serps: Iterable[Serp] = changed_serps_search.scan() changed_serps = safe_iter_scan(changed_serps) diff --git a/archive_query_log/parsers/url_query.py b/archive_query_log/parsers/url_query.py index 1ff8990e..def4d211 100644 --- a/archive_query_log/parsers/url_query.py +++ b/archive_query_log/parsers/url_query.py @@ -191,9 +191,7 @@ def parse_serps_url_query(config: Config) -> None: ) .query(FunctionScore(functions=[RandomScore()])) ) - num_changed_captures = ( - changed_captures_search.extra(track_total_hits=True) - .execute().hits.total.value) + num_changed_captures = changed_captures_search.count() if num_changed_captures > 0: changed_captures: Iterable[Capture] = changed_captures_search.scan() changed_captures = safe_iter_scan(changed_captures) diff --git a/archive_query_log/parsers/warc_query.py b/archive_query_log/parsers/warc_query.py index 086fe877..71987e88 100644 --- a/archive_query_log/parsers/warc_query.py +++ b/archive_query_log/parsers/warc_query.py @@ -17,7 +17,8 @@ from archive_query_log.namespaces import NAMESPACE_WARC_QUERY_PARSER from archive_query_log.orm import Serp, InnerParser, InnerProviderId, \ WarcQueryParserType, WarcQueryParser, WarcLocation -from archive_query_log.parsers.xml import parse_xml_tree, get_xml_xpath_non_empty_string +from archive_query_log.parsers.xml import parse_xml_tree, \ + get_xml_xpath_non_empty_string from archive_query_log.utils.es import safe_iter_scan, update_action from archive_query_log.utils.time import utc_now @@ -188,9 +189,7 @@ def parse_serps_warc_query(config: Config) -> None: ) .query(FunctionScore(functions=[RandomScore()])) ) - num_changed_serps = ( - changed_serps_search.extra(track_total_hits=True) - .execute().hits.total.value) + num_changed_serps = changed_serps_search.count() if num_changed_serps > 0: changed_serps: Iterable[Serp] = changed_serps_search.scan() changed_serps = safe_iter_scan(changed_serps) diff --git a/archive_query_log/sources/__init__.py b/archive_query_log/sources/__init__.py index 61ba6410..f9d84e7f 100644 --- a/archive_query_log/sources/__init__.py +++ b/archive_query_log/sources/__init__.py @@ -114,17 +114,11 @@ def _build_archive_sources(config: Config) -> None: ) .query(FunctionScore(functions=[RandomScore()])) ) - num_changed_archives_search = ( - changed_archives_search.extra(track_total_hits=True)) - num_changed_archives = ( - num_changed_archives_search.execute().hits.total.value) + num_changed_archives = changed_archives_search.count() all_providers_search = ( Provider.search(using=config.es.client) .filter(~Exists(field="exclusion_reason"))) - num_all_providers_search = ( - all_providers_search.extra(track_total_hits=True)) - num_all_providers = ( - num_all_providers_search.execute().hits.total.value) + num_all_providers = all_providers_search.count() num_batches_archives = ( num_changed_archives * num_all_providers + num_changed_archives) @@ -170,16 +164,9 @@ def _build_provider_sources(config: Config) -> None: ) .query(FunctionScore(functions=[RandomScore()])) ) - num_changed_providers_search = ( - changed_providers_search.extra(track_total_hits=True)) - num_changed_providers = ( - num_changed_providers_search.execute().hits.total.value) + num_changed_providers = changed_providers_search.count() all_archives_search = Archive.search(using=config.es.client) - num_all_archives_search = ( - all_archives_search.extra(track_total_hits=True)) - # pylint: disable=no-member - num_all_archives = ( - num_all_archives_search.execute().hits.total.value) + num_all_archives = all_archives_search.count() num_batches_providers = ( num_changed_providers * num_all_archives + num_changed_providers)