diff --git a/src/crashstats_tools/cmd_fetch_data.py b/src/crashstats_tools/cmd_fetch_data.py
index 03a00a4..4d612be 100644
--- a/src/crashstats_tools/cmd_fetch_data.py
+++ b/src/crashstats_tools/cmd_fetch_data.py
@@ -2,11 +2,13 @@
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.

+from datetime import timedelta
 from functools import partial
 import json
 from multiprocessing import Pool
 import os
 import sys
+import time

 import click
 from rich.console import Console
@@ -35,9 +37,10 @@ def fetch_crash(
     fetchraw,
     fetchdumps,
     fetchprocessed,
-    outputdir,
     color,
     overwrite,
+    stats,
+    outputdir,
 ):
     """Fetch crash data and save to correct place on the file system

@@ -59,9 +62,11 @@ def fetch_crash(
         # Fetch raw crash metadata to OUTPUTDIR/raw_crash/DATE/CRASHID
         fn = os.path.join(outputdir, "raw_crash", "20" + crash_id[-6:], crash_id)
         if os.path.exists(fn) and not overwrite:
-            console.print(f"{crash_id}: fetching raw crash -- already exists")
+            if not stats:
+                console.print(f"{crash_id}: fetching raw crash -- already exists")
         else:
-            console.print(f"{crash_id}: fetching raw crash")
+            if not stats:
+                console.print(f"{crash_id}: fetching raw crash")
             raw_crash = get_crash_annotations(crash_id, host=host, api_token=api_token)

             # Save raw crash to file system
@@ -87,11 +92,13 @@ def fetch_crash(

             fn = os.path.join(outputdir, dump_name, crash_id)
             if os.path.exists(fn) and not overwrite:
-                console.print(
-                    f"{crash_id}: fetching dump: {dump_name} -- already exists"
-                )
+                if not stats:
+                    console.print(
+                        f"{crash_id}: fetching dump: {dump_name} -- already exists"
+                    )
             else:
-                console.print(f"{crash_id}: fetching dump: {dump_name}")
+                if not stats:
+                    console.print(f"{crash_id}: fetching dump: {dump_name}")
                 dump_content = get_dump(
                     crash_id, dump_name=file_name, api_token=api_token, host=host
                 )
@@ -103,9 +110,11 @@ def fetch_crash(
         # Fetch processed crash data
         fn = os.path.join(outputdir, "processed_crash", crash_id)
         if os.path.exists(fn) and not overwrite:
-            console.print(f"{crash_id}: fetching processed crash -- already exists")
+            if not stats:
+                console.print(f"{crash_id}: fetching processed crash -- already exists")
         else:
-            console.print(f"{crash_id}: fetching processed crash")
+            if not stats:
+                console.print(f"{crash_id}: fetching processed crash")
             processed_crash = get_processed_crash(
                 crash_id, api_token=api_token, host=host
             )
@@ -153,6 +162,14 @@ def fetch_crash(
     type=click.IntRange(1, 10, clamp=True),
     help="how many workers to use to download data; requires CRASHSTATS_API_TOKEN",
 )
+@click.option(
+    "--stats/--no-stats",
+    default=False,
+    help=(
+        "prints download stats for large fetch-data jobs; if it's printing download "
+        "stats, it's not printing other things"
+    ),
+)
 @click.option(
     "--color/--no-color",
     default=True,
@@ -172,6 +189,7 @@ def fetch_data(
     fetchdumps,
     fetchprocessed,
     workers,
+    stats,
     color,
     outputdir,
     crash_ids,
@@ -259,17 +277,44 @@ def fetch_data(
         fetchdumps=fetchdumps,
         fetchprocessed=fetchprocessed,
         color=color,
-        outputdir=outputdir,
         overwrite=overwrite,
+        stats=stats,
+        outputdir=outputdir,
     )

+    start_time = time.time()
+    total = len(crash_ids)
+    i = 0
+
     if workers > 1:
-        with Pool(workers) as p:
-            p.map(fetch_crash_partial, crash_ids)
+        with Pool(workers) as pool:
+            for _ in pool.imap(fetch_crash_partial, crash_ids):
+                if stats:
+                    # Print something every 100
+                    if i % 100 == 0:
+                        seconds_per_item = (time.time() - start_time) / (i + 1)
+                        estimate_left = str(
+                            timedelta(seconds=seconds_per_item * (total - i + 1))
+                        )
+                        console.print(
+                            (f"Downloaded ({i}/{total}) {estimate_left}").strip()
+                        )
+                i += 1
     else:
         for crash_id in crash_ids:
             fetch_crash_partial(crash_id)
+            if stats:
+                if i % 100 == 0:
+                    seconds_per_item = (time.time() - start_time) / (i + 1)
+                    estimate_left = str(
+                        timedelta(seconds=int(seconds_per_item * (total - i + 1)))
+                    )
+                    console.print((f"Downloaded ({i}/{total}) {estimate_left}").strip())
+            i += 1
+
+    total_time = timedelta(seconds=int(time.time() - start_time))
+    console.print(f"Completed in {total_time}.")


 if __name__ == "__main__":
diff --git a/tests/test_fetch_data.py b/tests/test_fetch_data.py
index f2402ea..93c6710 100644
--- a/tests/test_fetch_data.py
+++ b/tests/test_fetch_data.py
@@ -61,6 +61,7 @@ def test_fetch_raw(tmpdir):
         No API token provided. Set CRASHSTATS_API_TOKEN in the environment.
         Skipping dumps and protected data.
         2ac9a763-83d2-4dca-89bb-091bd0220630: fetching raw crash
+        Completed in 0:00:00.
         """
     )
     data = pathlib.Path(
@@ -110,6 +111,7 @@ def test_fetch_raw_with_token(tmpdir):
         """\
         Using API token: 935exxxxxxxxxxxxxxxxxxxxxxxxxxxx
         2ac9a763-83d2-4dca-89bb-091bd0220630: fetching raw crash
+        Completed in 0:00:00.
         """
     )
     data = pathlib.Path(
@@ -236,6 +238,7 @@ def test_fetch_dumps(tmpdir):
         Using API token: 935exxxxxxxxxxxxxxxxxxxxxxxxxxxx
         2ac9a763-83d2-4dca-89bb-091bd0220630: fetching raw crash
         2ac9a763-83d2-4dca-89bb-091bd0220630: fetching dump: upload_file_minidump
+        Completed in 0:00:00.
         """
     )
     data = pathlib.Path(
@@ -283,6 +286,7 @@ def test_fetch_processed(tmpdir):
         No API token provided. Set CRASHSTATS_API_TOKEN in the environment.
         Skipping dumps and protected data.
         2ac9a763-83d2-4dca-89bb-091bd0220630: fetching processed crash
+        Completed in 0:00:00.
         """
     )
     data = pathlib.Path(tmpdir / "processed_crash" / crash_id).read_bytes()
@@ -329,6 +333,7 @@ def test_fetch_processed_with_token(tmpdir):
         """\
         Using API token: 935exxxxxxxxxxxxxxxxxxxxxxxxxxxx
         2ac9a763-83d2-4dca-89bb-091bd0220630: fetching processed crash
+        Completed in 0:00:00.
         """
     )
     data = pathlib.Path(tmpdir / "processed_crash" / crash_id).read_bytes()
@@ -379,9 +384,58 @@ def test_host(tmpdir):
         No API token provided. Set CRASHSTATS_API_TOKEN in the environment.
         Skipping dumps and protected data.
         2ac9a763-83d2-4dca-89bb-091bd0220630: fetching raw crash
+        Completed in 0:00:00.
         """
     )
     data = pathlib.Path(
         tmpdir / "raw_crash" / f"20{crash_id[-6:]}" / crash_id
     ).read_bytes()
     assert json.loads(data) == raw_crash
+
+
+@responses.activate
+def test_stats(tmpdir):
+    crash_id = "2ac9a763-83d2-4dca-89bb-091bd0220630"
+    raw_crash = {
+        "ProductName": "Firefox",
+        "Version": "100.0",
+    }
+
+    responses.add(
+        responses.GET,
+        DEFAULT_HOST + "/api/RawCrash/",
+        match=[
+            responses.matchers.query_param_matcher(
+                {
+                    "crash_id": crash_id,
+                    "format": "meta",
+                }
+            )
+        ],
+        status=200,
+        json=raw_crash,
+    )
+
+    runner = CliRunner()
+    args = [
+        "--raw",
+        "--no-dumps",
+        "--no-processed",
+        "--stats",
+        str(tmpdir),
+        crash_id,
+    ]
+    result = runner.invoke(
+        cli=cmd_fetch_data.fetch_data,
+        args=args,
+        env={"COLUMNS": "100"},
+    )
+    assert result.exit_code == 0
+    assert result.output == dedent(
+        """\
+        No API token provided. Set CRASHSTATS_API_TOKEN in the environment.
+        Skipping dumps and protected data.
+        Downloaded (0/1) 0:00:00
+        Completed in 0:00:00.
+        """
+    )
diff --git a/tests/test_reprocess.py b/tests/test_reprocess.py
index 7e83b7d..8063641 100644
--- a/tests/test_reprocess.py
+++ b/tests/test_reprocess.py
@@ -284,7 +284,6 @@ def test_reprocess_tenthousand_allowmany():
             """
         )
     )
-    print(result.output.splitlines()[-3:])
     assert result.output.endswith(
         dedent(
             f"""\
diff --git a/tests/test_supersearch.py b/tests/test_supersearch.py
index 63bcd25..56e0f4e 100644
--- a/tests/test_supersearch.py
+++ b/tests/test_supersearch.py
@@ -249,7 +249,6 @@ def test_json():
         args=["--_columns=uuid", "--_columns=signature", "--format=json"],
         env={"COLUMNS": "100"},
     )
-    print(result.output)
     assert result.exit_code == 0
     assert result.output == dedent(
         """\
diff --git a/tests/test_supersearchfacet.py b/tests/test_supersearchfacet.py
index 10b27c0..a2d2925 100644
--- a/tests/test_supersearchfacet.py
+++ b/tests/test_supersearchfacet.py
@@ -412,7 +412,6 @@ def test_supersearch_url():
         ],
         env={"COLUMNS": "100"},
     )
-    print(result.output)
     assert result.exit_code == 0
     assert result.output == dedent(
         """\
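For readers skimming the patch: the progress line added behind --stats is a plain average-rate estimate. Elapsed time divided by completed items gives seconds per item, that rate times the outstanding item count gives the time-remaining figure, and datetime.timedelta renders it as H:MM:SS. Below is a minimal standalone sketch of that arithmetic, not the patch itself; the do_work function and the item count are hypothetical stand-ins for fetching one crash.

import time
from datetime import timedelta


def print_progress(i, total, start_time):
    # Average cost of the items finished so far (i + 1 items are done
    # by the time the check runs, matching the patch).
    seconds_per_item = (time.time() - start_time) / (i + 1)
    # Same remaining-work expression the patch uses, truncated to whole
    # seconds as in its serial branch.
    estimate_left = timedelta(seconds=int(seconds_per_item * (total - i + 1)))
    print(f"Downloaded ({i}/{total}) {estimate_left}")


def do_work(item):
    # Hypothetical stand-in for fetching one crash.
    time.sleep(0.01)


start = time.time()
total = 250  # hypothetical number of crash ids
for i in range(total):
    do_work(i)
    if i % 100 == 0:
        print_progress(i, total, start)
print(f"Completed in {timedelta(seconds=int(time.time() - start))}.")

As the option's help text notes, turning on --stats suppresses the per-crash "fetching ..." messages so the periodic progress lines are not buried, while the final "Completed in ..." line is printed on every run, which is why each existing test's expected output now ends with "Completed in 0:00:00."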