Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Report reimage failures to Sentry #1875

Merged
merged 2 commits into from
Jul 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions teuthology/dispatcher/supervisor.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from teuthology.task import internal
from teuthology.misc import decanonicalize_hostname as shortname
from teuthology.lock import query
from teuthology.util import sentry

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -232,6 +233,7 @@ def reimage(job_config):
ctx.config,
dict(status='dead', failure_reason='Error reimaging machines: ' + str(e))
)
ctx.summary['sentry_event'] = sentry.report_error(job_config, e)
nuke.nuke(ctx, True)
# Machine that fails to reimage after 10 times will be marked down
check_for_reimage_failures_and_mark_down(targets)
Expand Down
44 changes: 3 additions & 41 deletions teuthology/run_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,7 @@
import types
import yaml

from copy import deepcopy
from humanfriendly import format_timespan
import sentry_sdk

import teuthology.exporter as exporter

Expand All @@ -18,6 +16,7 @@
from teuthology.job_status import set_status, get_status
from teuthology.misc import get_http_log_path, get_results_url
from teuthology.timer import Timer
from teuthology.util import sentry

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -94,6 +93,7 @@ def run_tasks(tasks, ctx):
else:
timer = Timer()
stack = []
taskname = ""
try:
for taskdict in tasks:
try:
Expand All @@ -119,45 +119,7 @@ def run_tasks(tasks, ctx):
ctx.summary['failure_reason'] = str(e)
log.exception('Saw exception from tasks.')

if teuth_config.sentry_dsn:
sentry_sdk.init(teuth_config.sentry_dsn)
config = deepcopy(ctx.config)

tags = {
'task': taskname,
'owner': ctx.owner,
}
optional_tags = ('teuthology_branch', 'branch', 'suite',
'machine_type', 'os_type', 'os_version')
for tag in optional_tags:
if tag in config:
tags[tag] = config[tag]

# Remove ssh keys from reported config
if 'targets' in config:
targets = config['targets']
for host in targets.keys():
targets[host] = '<redacted>'

job_id = ctx.config.get('job_id')
archive_path = ctx.config.get('archive_path')
extras = dict(config=config,
)
if job_id:
extras['logs'] = get_http_log_path(archive_path, job_id)

fingerprint = e.fingerprint() if hasattr(e, 'fingerprint') else None
exc_id = sentry_sdk.capture_exception(
error=e,
tags=tags,
extras=extras,
fingerprint=fingerprint,
)
event_url = "{server}/?query={id}".format(
server=teuth_config.sentry_server.strip('/'), id=exc_id)
log.exception(" Sentry event: %s" % event_url)
ctx.summary['sentry_event'] = event_url

ctx.summary['sentry_event'] = sentry.report_error(ctx.config, e, taskname)
if ctx.config.get('interactive-on-error'):
ctx.config['interactive-on-error'] = False
from teuthology.task import interactive
Expand Down
52 changes: 52 additions & 0 deletions teuthology/util/sentry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import logging
import sentry_sdk

from copy import deepcopy

from teuthology.config import config as teuth_config
from teuthology.misc import get_http_log_path

log = logging.getLogger(__name__)


def report_error(job_config, exception, task_name=None):
    """
    Send ``exception`` to Sentry, annotated with details from ``job_config``.

    Nothing is reported (and ``None`` is returned) when ``sentry_dsn`` is
    not set in the teuthology config.

    :param job_config: Mapping describing the job. It is deep-copied before
                       any modification, so the caller's object is untouched.
    :param exception:  The exception instance to report. If it has a
                       ``fingerprint`` attribute, that is called and the
                       result is passed to Sentry as the fingerprint.
    :param task_name:  Optional value for the ``task`` tag.
    :returns:          A query URL built from ``sentry_server`` and the id
                       returned by ``sentry_sdk.capture_exception``, or
                       ``None`` when reporting is disabled.
    """
    if not teuth_config.sentry_dsn:
        return None

    sentry_sdk.init(teuth_config.sentry_dsn)
    # Work on a private copy: the redaction below mutates the config.
    reported_config = deepcopy(job_config)

    # 'task' and 'owner' are always sent; the rest only when present.
    tags = {
        'task': task_name,
        'owner': reported_config.get("owner"),
    }
    optional_tags = ('teuthology_branch', 'branch', 'suite',
                     'machine_type', 'os_type', 'os_version')
    tags.update({
        name: reported_config[name]
        for name in optional_tags
        if name in reported_config
    })

    # Remove ssh keys from reported config
    if 'targets' in reported_config:
        hosts = reported_config['targets']
        hosts.update(dict.fromkeys(hosts.keys(), '<redacted>'))

    extras = {'config': reported_config}
    job_id = reported_config.get('job_id')
    if job_id:
        extras['logs'] = get_http_log_path(
            reported_config.get('archive_path'),
            job_id,
        )

    if hasattr(exception, 'fingerprint'):
        fingerprint = exception.fingerprint()
    else:
        fingerprint = None

    exc_id = sentry_sdk.capture_exception(
        error=exception,
        tags=tags,
        extras=extras,
        fingerprint=fingerprint,
    )

    server = teuth_config.sentry_server.strip('/')
    event_url = "{server}/?query={id}".format(server=server, id=exc_id)
    # Message is formatted eagerly on purpose, exactly as before.
    log.exception(" Sentry event: %s" % event_url)
    return event_url


Loading