Skip to content

Commit

Permalink
Simplify doc counts
Browse files Browse the repository at this point in the history
  • Loading branch information
janheinrichmerker committed Nov 20, 2023
1 parent 349227c commit 07c5c12
Show file tree
Hide file tree
Showing 8 changed files with 13 additions and 39 deletions.
4 changes: 1 addition & 3 deletions archive_query_log/captures/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,7 @@ def fetch_captures(config: Config) -> None:
)
.query(FunctionScore(functions=[RandomScore()]))
)
num_changed_sources = (
changed_sources_search.extra(track_total_hits=True)
.execute().hits.total.value)
num_changed_sources = changed_sources_search.count()
if num_changed_sources > 0:
echo(f"Fetching captures for {num_changed_sources} "
f"new/changed sources.")
Expand Down
4 changes: 1 addition & 3 deletions archive_query_log/downloaders/warc.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,7 @@ def download_serps_warc(config: Config) -> None:
)
.query(FunctionScore(functions=[RandomScore()]))
)
num_changed_serps = (
changed_serps_search.extra(track_total_hits=True)
.execute().hits.total.value)
num_changed_serps = changed_serps_search.count()

if num_changed_serps <= 0:
echo("No new/changed captures.")
Expand Down
4 changes: 1 addition & 3 deletions archive_query_log/monitoring/home.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,9 +108,7 @@ def _get_processed_progress(
f"doc['{timestamp_field}'].value)",
)
)
total_processed = (search_processed.extra(track_total_hits=True).execute()
.hits.total.value)

total_processed = search_processed.count()
progress = Progress(
name=name,
description=description,
Expand Down
4 changes: 1 addition & 3 deletions archive_query_log/parsers/url_offset.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,9 +178,7 @@ def parse_serps_url_offset(config: Config) -> None:
)
.query(FunctionScore(functions=[RandomScore()]))
)
num_changed_serps = (
changed_serps_search.extra(track_total_hits=True)
.execute().hits.total.value)
num_changed_serps = changed_serps_search.count()
if num_changed_serps > 0:
changed_serps: Iterable[Serp] = changed_serps_search.scan()
changed_serps = safe_iter_scan(changed_serps)
Expand Down
4 changes: 1 addition & 3 deletions archive_query_log/parsers/url_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,9 +178,7 @@ def parse_serps_url_page(config: Config) -> None:
)
.query(FunctionScore(functions=[RandomScore()]))
)
num_changed_serps = (
changed_serps_search.extra(track_total_hits=True)
.execute().hits.total.value)
num_changed_serps = changed_serps_search.count()
if num_changed_serps > 0:
changed_serps: Iterable[Serp] = changed_serps_search.scan()
changed_serps = safe_iter_scan(changed_serps)
Expand Down
4 changes: 1 addition & 3 deletions archive_query_log/parsers/url_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,9 +191,7 @@ def parse_serps_url_query(config: Config) -> None:
)
.query(FunctionScore(functions=[RandomScore()]))
)
num_changed_captures = (
changed_captures_search.extra(track_total_hits=True)
.execute().hits.total.value)
num_changed_captures = changed_captures_search.count()
if num_changed_captures > 0:
changed_captures: Iterable[Capture] = changed_captures_search.scan()
changed_captures = safe_iter_scan(changed_captures)
Expand Down
7 changes: 3 additions & 4 deletions archive_query_log/parsers/warc_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
from archive_query_log.namespaces import NAMESPACE_WARC_QUERY_PARSER
from archive_query_log.orm import Serp, InnerParser, InnerProviderId, \
WarcQueryParserType, WarcQueryParser, WarcLocation
from archive_query_log.parsers.xml import parse_xml_tree, get_xml_xpath_non_empty_string
from archive_query_log.parsers.xml import parse_xml_tree, \
get_xml_xpath_non_empty_string
from archive_query_log.utils.es import safe_iter_scan, update_action
from archive_query_log.utils.time import utc_now

Expand Down Expand Up @@ -188,9 +189,7 @@ def parse_serps_warc_query(config: Config) -> None:
)
.query(FunctionScore(functions=[RandomScore()]))
)
num_changed_serps = (
changed_serps_search.extra(track_total_hits=True)
.execute().hits.total.value)
num_changed_serps = changed_serps_search.count()
if num_changed_serps > 0:
changed_serps: Iterable[Serp] = changed_serps_search.scan()
changed_serps = safe_iter_scan(changed_serps)
Expand Down
21 changes: 4 additions & 17 deletions archive_query_log/sources/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,17 +114,11 @@ def _build_archive_sources(config: Config) -> None:
)
.query(FunctionScore(functions=[RandomScore()]))
)
num_changed_archives_search = (
changed_archives_search.extra(track_total_hits=True))
num_changed_archives = (
num_changed_archives_search.execute().hits.total.value)
num_changed_archives = changed_archives_search.count()
all_providers_search = (
Provider.search(using=config.es.client)
.filter(~Exists(field="exclusion_reason")))
num_all_providers_search = (
all_providers_search.extra(track_total_hits=True))
num_all_providers = (
num_all_providers_search.execute().hits.total.value)
num_all_providers = all_providers_search.count()
num_batches_archives = (
num_changed_archives * num_all_providers +
num_changed_archives)
Expand Down Expand Up @@ -170,16 +164,9 @@ def _build_provider_sources(config: Config) -> None:
)
.query(FunctionScore(functions=[RandomScore()]))
)
num_changed_providers_search = (
changed_providers_search.extra(track_total_hits=True))
num_changed_providers = (
num_changed_providers_search.execute().hits.total.value)
num_changed_providers = changed_providers_search.count()
all_archives_search = Archive.search(using=config.es.client)
num_all_archives_search = (
all_archives_search.extra(track_total_hits=True))
# pylint: disable=no-member
num_all_archives = (
num_all_archives_search.execute().hits.total.value)
num_all_archives = all_archives_search.count()
num_batches_providers = (
num_changed_providers * num_all_archives +
num_changed_providers)
Expand Down

0 comments on commit 07c5c12

Please sign in to comment.