Add download stats and estimate to fetch-data (#149) #150

Merged
merged 1 commit on Sep 18, 2024
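
A minimal sketch of exercising the new flag from Python, mirroring the CliRunner pattern the tests below use (the output directory "./crashdata" and the import path are illustrative assumptions; without mocked responses this hits the live Crash Stats API):

    from click.testing import CliRunner

    from crashstats_tools import cmd_fetch_data

    runner = CliRunner()
    result = runner.invoke(
        cli=cmd_fetch_data.fetch_data,
        args=[
            "--raw",
            "--no-dumps",
            "--no-processed",
            "--stats",
            "./crashdata",  # hypothetical output directory
            "2ac9a763-83d2-4dca-89bb-091bd0220630",
        ],
    )
    print(result.output)

With --stats on, the per-crash "fetching ..." lines are suppressed and a "Downloaded (i/total) <estimate>" progress line is printed every 100 crashes instead.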
69 changes: 57 additions & 12 deletions src/crashstats_tools/cmd_fetch_data.py
@@ -2,11 +2,13 @@
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+from datetime import timedelta
 from functools import partial
 import json
 from multiprocessing import Pool
 import os
 import sys
+import time
 
 import click
 from rich.console import Console
@@ -35,9 +37,10 @@ def fetch_crash(
     fetchraw,
     fetchdumps,
     fetchprocessed,
-    outputdir,
     color,
     overwrite,
+    stats,
+    outputdir,
 ):
     """Fetch crash data and save to correct place on the file system

@@ -59,9 +62,11 @@ def fetch_crash(
         # Fetch raw crash metadata to OUTPUTDIR/raw_crash/DATE/CRASHID
         fn = os.path.join(outputdir, "raw_crash", "20" + crash_id[-6:], crash_id)
         if os.path.exists(fn) and not overwrite:
-            console.print(f"{crash_id}: fetching raw crash -- already exists")
+            if not stats:
+                console.print(f"{crash_id}: fetching raw crash -- already exists")
         else:
-            console.print(f"{crash_id}: fetching raw crash")
+            if not stats:
+                console.print(f"{crash_id}: fetching raw crash")
             raw_crash = get_crash_annotations(crash_id, host=host, api_token=api_token)
 
         # Save raw crash to file system
@@ -87,11 +92,13 @@ def fetch_crash(
 
             fn = os.path.join(outputdir, dump_name, crash_id)
             if os.path.exists(fn) and not overwrite:
-                console.print(
-                    f"{crash_id}: fetching dump: {dump_name} -- already exists"
-                )
+                if not stats:
+                    console.print(
+                        f"{crash_id}: fetching dump: {dump_name} -- already exists"
+                    )
             else:
-                console.print(f"{crash_id}: fetching dump: {dump_name}")
+                if not stats:
+                    console.print(f"{crash_id}: fetching dump: {dump_name}")
                 dump_content = get_dump(
                     crash_id, dump_name=file_name, api_token=api_token, host=host
                 )
@@ -103,9 +110,11 @@ def fetch_crash(
         # Fetch processed crash data
         fn = os.path.join(outputdir, "processed_crash", crash_id)
         if os.path.exists(fn) and not overwrite:
-            console.print(f"{crash_id}: fetching processed crash -- already exists")
+            if not stats:
+                console.print(f"{crash_id}: fetching processed crash -- already exists")
         else:
-            console.print(f"{crash_id}: fetching processed crash")
+            if not stats:
+                console.print(f"{crash_id}: fetching processed crash")
             processed_crash = get_processed_crash(
                 crash_id, api_token=api_token, host=host
             )
@@ -153,6 +162,14 @@ def fetch_crash(
     type=click.IntRange(1, 10, clamp=True),
     help="how many workers to use to download data; requires CRASHSTATS_API_TOKEN",
 )
+@click.option(
+    "--stats/--no-stats",
+    default=False,
+    help=(
+        "prints download stats for large fetch-data jobs; if it's printing download "
+        "stats, it's not printing other things"
+    ),
+)
 @click.option(
     "--color/--no-color",
     default=True,
@@ -172,6 +189,7 @@ def fetch_data(
     fetchdumps,
     fetchprocessed,
     workers,
+    stats,
     color,
     outputdir,
     crash_ids,
@@ -259,17 +277,44 @@ def fetch_data(
         fetchdumps=fetchdumps,
         fetchprocessed=fetchprocessed,
         color=color,
-        outputdir=outputdir,
         overwrite=overwrite,
+        stats=stats,
+        outputdir=outputdir,
     )
 
+    start_time = time.time()
+    total = len(crash_ids)
+    i = 0
+
     if workers > 1:
-        with Pool(workers) as p:
-            p.map(fetch_crash_partial, crash_ids)
+        with Pool(workers) as pool:
+            for _ in pool.imap(fetch_crash_partial, crash_ids):
+                if stats:
+                    # Print something every 100
+                    if i % 100 == 0:
+                        seconds_per_item = (time.time() - start_time) / (i + 1)
+                        estimate_left = str(
+                            timedelta(seconds=seconds_per_item * (total - i + 1))
+                        )
+                        console.print(
+                            (f"Downloaded ({i}/{total}) {estimate_left}").strip()
+                        )
+                i += 1
 
     else:
         for crash_id in crash_ids:
             fetch_crash_partial(crash_id)
+            if stats:
+                if i % 100 == 0:
+                    seconds_per_item = (time.time() - start_time) / (i + 1)
+                    estimate_left = str(
+                        timedelta(seconds=int(seconds_per_item * (total - i + 1)))
+                    )
+                    console.print((f"Downloaded ({i}/{total}) {estimate_left}").strip())
+            i += 1
+
+    total_time = timedelta(seconds=int(time.time() - start_time))
+    console.print(f"Completed in {total_time}.")
 
 
 if __name__ == "__main__":
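
The progress estimate above is simple elapsed-time extrapolation: the average cost of the items finished so far, multiplied by the items left. A standalone sketch of the same arithmetic (the function name and arguments are illustrative, not part of the module):

    from datetime import timedelta
    import time

    def estimate_remaining(start_time, completed, total):
        # Average seconds per item so far, guarding against division by
        # zero on the first item.
        seconds_per_item = (time.time() - start_time) / (completed + 1)
        # Extrapolate over what is left, truncated to whole seconds so the
        # printed timedelta reads like 0:01:15 rather than 0:01:15.023456.
        return timedelta(seconds=int(seconds_per_item * (total - completed + 1)))

For example, 100 of 250 crashes fetched in about 50 seconds works out to roughly 0.5 seconds per crash, so the estimate for the remaining 150 prints as roughly 0:01:15.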
54 changes: 54 additions & 0 deletions tests/test_fetch_data.py
@@ -61,6 +61,7 @@ def test_fetch_raw(tmpdir):
         No API token provided. Set CRASHSTATS_API_TOKEN in the environment.
         Skipping dumps and protected data.
         2ac9a763-83d2-4dca-89bb-091bd0220630: fetching raw crash
+        Completed in 0:00:00.
         """
     )
     data = pathlib.Path(
@@ -110,6 +111,7 @@ def test_fetch_raw_with_token(tmpdir):
         """\
         Using API token: 935exxxxxxxxxxxxxxxxxxxxxxxxxxxx
         2ac9a763-83d2-4dca-89bb-091bd0220630: fetching raw crash
+        Completed in 0:00:00.
         """
     )
     data = pathlib.Path(
@@ -236,6 +238,7 @@ def test_fetch_dumps(tmpdir):
         Using API token: 935exxxxxxxxxxxxxxxxxxxxxxxxxxxx
         2ac9a763-83d2-4dca-89bb-091bd0220630: fetching raw crash
         2ac9a763-83d2-4dca-89bb-091bd0220630: fetching dump: upload_file_minidump
+        Completed in 0:00:00.
         """
     )
     data = pathlib.Path(
@@ -283,6 +286,7 @@ def test_fetch_processed(tmpdir):
         No API token provided. Set CRASHSTATS_API_TOKEN in the environment.
         Skipping dumps and protected data.
         2ac9a763-83d2-4dca-89bb-091bd0220630: fetching processed crash
+        Completed in 0:00:00.
         """
     )
     data = pathlib.Path(tmpdir / "processed_crash" / crash_id).read_bytes()
@@ -329,6 +333,7 @@ def test_fetch_processed_with_token(tmpdir):
         """\
         Using API token: 935exxxxxxxxxxxxxxxxxxxxxxxxxxxx
         2ac9a763-83d2-4dca-89bb-091bd0220630: fetching processed crash
+        Completed in 0:00:00.
         """
     )
     data = pathlib.Path(tmpdir / "processed_crash" / crash_id).read_bytes()
@@ -379,9 +384,58 @@ def test_host(tmpdir):
         No API token provided. Set CRASHSTATS_API_TOKEN in the environment.
         Skipping dumps and protected data.
         2ac9a763-83d2-4dca-89bb-091bd0220630: fetching raw crash
+        Completed in 0:00:00.
         """
     )
     data = pathlib.Path(
         tmpdir / "raw_crash" / f"20{crash_id[-6:]}" / crash_id
     ).read_bytes()
     assert json.loads(data) == raw_crash
+
+
+@responses.activate
+def test_stats(tmpdir):
+    crash_id = "2ac9a763-83d2-4dca-89bb-091bd0220630"
+    raw_crash = {
+        "ProductName": "Firefox",
+        "Version": "100.0",
+    }
+
+    responses.add(
+        responses.GET,
+        DEFAULT_HOST + "/api/RawCrash/",
+        match=[
+            responses.matchers.query_param_matcher(
+                {
+                    "crash_id": crash_id,
+                    "format": "meta",
+                }
+            )
+        ],
+        status=200,
+        json=raw_crash,
+    )
+
+    runner = CliRunner()
+    args = [
+        "--raw",
+        "--no-dumps",
+        "--no-processed",
+        "--stats",
+        str(tmpdir),
+        crash_id,
+    ]
+    result = runner.invoke(
+        cli=cmd_fetch_data.fetch_data,
+        args=args,
+        env={"COLUMNS": "100"},
+    )
+    assert result.exit_code == 0
+    assert result.output == dedent(
+        """\
+        No API token provided. Set CRASHSTATS_API_TOKEN in the environment.
+        Skipping dumps and protected data.
+        Downloaded (0/1) 0:00:00
+        Completed in 0:00:00.
+        """
+    )
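
Note the expected output here: with a single crash id, the counter starts at 0 and 0 % 100 == 0 is true, so one "Downloaded (0/1) ..." progress line prints before the completion summary.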
1 change: 0 additions & 1 deletion tests/test_reprocess.py
@@ -284,7 +284,6 @@ def test_reprocess_tenthousand_allowmany():
            """
        )
    )
-    print(result.output.splitlines()[-3:])
    assert result.output.endswith(
        dedent(
            f"""\
1 change: 0 additions & 1 deletion tests/test_supersearch.py
@@ -249,7 +249,6 @@ def test_json():
        args=["--_columns=uuid", "--_columns=signature", "--format=json"],
        env={"COLUMNS": "100"},
    )
-    print(result.output)
    assert result.exit_code == 0
    assert result.output == dedent(
        """\
1 change: 0 additions & 1 deletion tests/test_supersearchfacet.py
@@ -412,7 +412,6 @@ def test_supersearch_url():
        ],
        env={"COLUMNS": "100"},
    )
-    print(result.output)
    assert result.exit_code == 0
    assert result.output == dedent(
        """\