From 2cb92cc20934e6edf28f32ddcc5407f61eb1584e Mon Sep 17 00:00:00 2001 From: Johan Bloemberg Date: Wed, 18 Sep 2024 08:47:41 +0200 Subject: [PATCH] Improve alerting - Add more alerting test domains - alert if at least 2 domains fail - don't test mail on domains that have no MX - also measure if probe passed or not - don't measure scores for probes that don't count towards the total score (appsecpriv) --- docker/cron/periodic/15min/tests.py | 35 ++++-- docker/docker-compose.yml | 3 +- docker/grafana/dashboards/periodic-tests.json | 119 ++++++++++++++++-- 3 files changed, 140 insertions(+), 17 deletions(-) diff --git a/docker/cron/periodic/15min/tests.py b/docker/cron/periodic/15min/tests.py index 3da09951d..99417acdc 100755 --- a/docker/cron/periodic/15min/tests.py +++ b/docker/cron/periodic/15min/tests.py @@ -33,11 +33,28 @@ URL_BASE = f"http://{IPV4_IP_APP_INTERNAL}:8080" HEADERS = {"Host": INTERNETNL_DOMAINNAME} -# domain's to use in website tests -WEBSITE_TEST_DOMAINS = [ - "example.nl", - "example.com", -] +TEST_DOMAINS = { + # domains to use in website tests + "site": [ + "example.nl", + "example.com", + "internetsociety.org", + "ripe.net", + "surf.nl", + "ecp.nl", + "forumstandaardisatie.nl", + "minez.nl", + ], + # domains to use in mail tests + "mail": [ + "internetsociety.org", + "ripe.net", + "surf.nl", + "ecp.nl", + "forumstandaardisatie.nl", + "minez.nl", + ], +} METRIC_PROBE_DONE = Gauge("tests_probe_done_total", "Whether the probe completed.", ["test", "domain", "probe"]) @@ -46,6 +63,7 @@ "tests_probe_runtime_seconds", "Amount of time probe ran before done.", ["test", "domain", "probe"] ) METRIC_PROBE_SCORE = Gauge("tests_probe_score", "Score of the probe.", ["test", "domain", "probe"]) +METRIC_PROBE_PASSED = Gauge("tests_probe_pass", "Probe has passed.", ["test", "domain", "probe"]) METRIC_TEST_RUN = Gauge("tests_test_run_total", "Test that have been run.", ["test", "domain"]) METRIC_TEST_CACHE = Gauge("tests_test_cached_total", "Test runs that 
returned cached results.", ["test", "domain"]) @@ -119,14 +137,17 @@ def run_tests_on_domain(test, domain): r.raise_for_status() if r.status_code == 200: probe_result = r.json() - METRIC_PROBE_SCORE.labels(test, domain, probe_name).set(probe_result["totalscore"]) + # only measure probe scores that count towards total score + if probe_result["maxscore"]: + METRIC_PROBE_SCORE.labels(test, domain, probe_name).set(probe_result["totalscore"]) + METRIC_PROBE_PASSED.labels(test, domain, probe_name).set(probe_result["verdict"] == "passed") except Exception: log.exception("failed to get probe score") def run_tests(): for test in TESTS: - for domain in WEBSITE_TEST_DOMAINS: + for domain in TEST_DOMAINS[test]: log.info(f"testing: {test} {domain}") METRIC_TEST_RUN.labels(test, domain).set(1) METRIC_TEST_CACHE.labels(test, domain).set(0) diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 725006f59..43e675523 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -1073,7 +1073,8 @@ configs: - name: End to end monitoring rules: - alert: HighTestRuntime - expr: min(tests_test_runtime_seconds{test="site"})>=10 and max(tests_test_runtime_seconds{test="site"})>=30 + # when site probes for 2 or more of the test domains take longer than 30 seconds something is wrong + expr: count(tests_test_runtime_seconds{test="site"} >= 30) >= 2 annotations: host: $INTERNETNL_DOMAINNAME summary: Tests/probes take longer to complete than expected diff --git a/docker/grafana/dashboards/periodic-tests.json b/docker/grafana/dashboards/periodic-tests.json index 09bbdcfa3..62cbda080 100644 --- a/docker/grafana/dashboards/periodic-tests.json +++ b/docker/grafana/dashboards/periodic-tests.json @@ -38,6 +38,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "description": "", "fieldConfig": { "defaults": { "color": { @@ -53,7 +54,7 @@ }, { "color": "#EAB839", - "value": 30 + "value": 40 }, { "color": "red", @@ -95,13 +96,13 @@ "uid": "PBFA97CFB590B2093" }, 
"editorMode": "code", - "expr": "avg(tests_test_runtime_seconds{test=~\"$test\", domain=~\"$domain\"})", + "expr": "quantile( 0.50, tests_test_runtime_seconds{test=~\"$test\", domain=~\"$domain\"})", "legendFormat": "__auto", "range": true, "refId": "A" } ], - "title": "Average runtime", + "title": "Median runtime", "type": "stat" }, { @@ -167,7 +168,7 @@ "uid": "PBFA97CFB590B2093" }, "editorMode": "code", - "expr": "avg(tests_probe_score{test=~\"$test\", domain=~\"$domain\"})", + "expr": "avg(tests_probe_score{test=~\"$test\", domain=~\"$domain\", probe=~\"$probe\"})", "legendFormat": "__auto", "range": true, "refId": "A" @@ -670,7 +671,7 @@ "x": 0, "y": 18 }, - "id": 5, + "id": 14, "interval": "15m", "options": { "legend": { @@ -705,13 +706,113 @@ "title": "Probe scores per domain", "type": "timeseries" }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bool_yes_no" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 28 + }, + "id": 5, + "interval": "15m", + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean", + "lastNotNull" + ], + "displayMode": "table", + "placement": 
"bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "tests_probe_pass{test=~\"$test\", domain=~\"$domain\", probe=~\"$probe\"}", + "legendFormat": "{{test}} - {{domain}} - {{probe}}", + "range": true, + "refId": "A" + } + ], + "title": "Passed probes per domain", + "type": "timeseries" + }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 28 + "y": 38 }, "id": 6, "panels": [], @@ -781,7 +882,7 @@ "h": 10, "w": 24, "x": 0, - "y": 29 + "y": 39 }, "id": 7, "interval": "15m", @@ -925,6 +1026,6 @@ "timezone": "", "title": "Periodic tests", "uid": "af7d1d82-c0f9-4d8d-bc03-542c4c4c75c0", - "version": 2, + "version": 3, "weekStart": "" -} \ No newline at end of file +}