From 2cb92cc20934e6edf28f32ddcc5407f61eb1584e Mon Sep 17 00:00:00 2001
From: Johan Bloemberg <git@ijohan.nl>
Date: Wed, 18 Sep 2024 08:47:41 +0200
Subject: [PATCH] Improve alerting

- Add more alerting test domains
- Alert if at least 2 domains fail
- Don't test mail on domains that have no MX
- Also measure whether each probe passed
- Don't measure scores for probes that don't count towards the total score (appsecpriv)
---
 docker/cron/periodic/15min/tests.py           |  35 ++++--
 docker/docker-compose.yml                     |   3 +-
 docker/grafana/dashboards/periodic-tests.json | 119 ++++++++++++++++--
 3 files changed, 140 insertions(+), 17 deletions(-)

diff --git a/docker/cron/periodic/15min/tests.py b/docker/cron/periodic/15min/tests.py
index 3da09951d..99417acdc 100755
--- a/docker/cron/periodic/15min/tests.py
+++ b/docker/cron/periodic/15min/tests.py
@@ -33,11 +33,28 @@
 URL_BASE = f"http://{IPV4_IP_APP_INTERNAL}:8080"
 HEADERS = {"Host": INTERNETNL_DOMAINNAME}
 
-# domain's to use in website tests
-WEBSITE_TEST_DOMAINS = [
-    "example.nl",
-    "example.com",
-]
+TEST_DOMAINS = {
+    # domains to use in website tests
+    "site": [
+        "example.nl",
+        "example.com",
+        "internetsociety.org",
+        "ripe.net",
+        "surf.nl",
+        "ecp.nl",
+        "forumstandaardisatie.nl",
+        "minez.nl",
+    ],
+    # domains to use in mail tests
+    "mail": [
+        "internetsociety.org",
+        "ripe.net",
+        "surf.nl",
+        "ecp.nl",
+        "forumstandaardisatie.nl",
+        "minez.nl",
+    ],
+}
 
 
 METRIC_PROBE_DONE = Gauge("tests_probe_done_total", "Whether the probe completed.", ["test", "domain", "probe"])
@@ -46,6 +63,7 @@
     "tests_probe_runtime_seconds", "Amount of time probe ran before done.", ["test", "domain", "probe"]
 )
 METRIC_PROBE_SCORE = Gauge("tests_probe_score", "Score of the probe.", ["test", "domain", "probe"])
+METRIC_PROBE_PASSED = Gauge("tests_probe_pass", "Probe has passed.", ["test", "domain", "probe"])
 
 METRIC_TEST_RUN = Gauge("tests_test_run_total", "Test that have been run.", ["test", "domain"])
 METRIC_TEST_CACHE = Gauge("tests_test_cached_total", "Test runs that returned cached results.", ["test", "domain"])
@@ -119,14 +137,17 @@ def run_tests_on_domain(test, domain):
             r.raise_for_status()
             if r.status_code == 200:
                 probe_result = r.json()
-                METRIC_PROBE_SCORE.labels(test, domain, probe_name).set(probe_result["totalscore"])
+                # only measure probe scores that count towards total score
+                if probe_result["maxscore"]:
+                    METRIC_PROBE_SCORE.labels(test, domain, probe_name).set(probe_result["totalscore"])
+                METRIC_PROBE_PASSED.labels(test, domain, probe_name).set(probe_result["verdict"] == "passed")
         except Exception:
             log.exception("failed to get probe score")
 
 
 def run_tests():
     for test in TESTS:
-        for domain in WEBSITE_TEST_DOMAINS:
+        for domain in TEST_DOMAINS[test]:
             log.info(f"testing: {test} {domain}")
             METRIC_TEST_RUN.labels(test, domain).set(1)
             METRIC_TEST_CACHE.labels(test, domain).set(0)
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
index 725006f59..43e675523 100644
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@@ -1073,7 +1073,8 @@ configs:
       - name: End to end monitoring
         rules:
         - alert: HighTestRuntime
-          expr: min(tests_test_runtime_seconds{test="site"})>=10 and max(tests_test_runtime_seconds{test="site"})>=30
+          # when site probes for 2 or more of the test domains take 30 seconds or longer, something is wrong
+          expr: count(tests_test_runtime_seconds{test="site"} >= 30) >= 2
           annotations:
             host: $INTERNETNL_DOMAINNAME
             summary: Tests/probes take longer to complete than expected
diff --git a/docker/grafana/dashboards/periodic-tests.json b/docker/grafana/dashboards/periodic-tests.json
index 09bbdcfa3..62cbda080 100644
--- a/docker/grafana/dashboards/periodic-tests.json
+++ b/docker/grafana/dashboards/periodic-tests.json
@@ -38,6 +38,7 @@
         "type": "prometheus",
         "uid": "PBFA97CFB590B2093"
       },
+      "description": "",
       "fieldConfig": {
         "defaults": {
           "color": {
@@ -53,7 +54,7 @@
               },
               {
                 "color": "#EAB839",
-                "value": 30
+                "value": 40
               },
               {
                 "color": "red",
@@ -95,13 +96,13 @@
             "uid": "PBFA97CFB590B2093"
           },
           "editorMode": "code",
-          "expr": "avg(tests_test_runtime_seconds{test=~\"$test\", domain=~\"$domain\"})",
+          "expr": "quantile( 0.50, tests_test_runtime_seconds{test=~\"$test\", domain=~\"$domain\"})",
           "legendFormat": "__auto",
           "range": true,
           "refId": "A"
         }
       ],
-      "title": "Average runtime",
+      "title": "Median runtime",
       "type": "stat"
     },
     {
@@ -167,7 +168,7 @@
             "uid": "PBFA97CFB590B2093"
           },
           "editorMode": "code",
-          "expr": "avg(tests_probe_score{test=~\"$test\", domain=~\"$domain\"})",
+          "expr": "avg(tests_probe_score{test=~\"$test\", domain=~\"$domain\", probe=~\"$probe\"})",
           "legendFormat": "__auto",
           "range": true,
           "refId": "A"
@@ -670,7 +671,7 @@
         "x": 0,
         "y": 18
       },
-      "id": 5,
+      "id": 14,
       "interval": "15m",
       "options": {
         "legend": {
@@ -705,13 +706,113 @@
       "title": "Probe scores per domain",
       "type": "timeseries"
     },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "PBFA97CFB590B2093"
+      },
+      "description": "",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "bool_yes_no"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 10,
+        "w": 24,
+        "x": 0,
+        "y": 28
+      },
+      "id": 5,
+      "interval": "15m",
+      "options": {
+        "legend": {
+          "calcs": [
+            "min",
+            "max",
+            "mean",
+            "lastNotNull"
+          ],
+          "displayMode": "table",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "PBFA97CFB590B2093"
+          },
+          "editorMode": "code",
+          "expr": "tests_probe_pass{test=~\"$test\", domain=~\"$domain\", probe=~\"$probe\"}",
+          "legendFormat": "{{test}} - {{domain}} - {{probe}}",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Passed probes per domain",
+      "type": "timeseries"
+    },
     {
       "collapsed": false,
       "gridPos": {
         "h": 1,
         "w": 24,
         "x": 0,
-        "y": 28
+        "y": 38
       },
       "id": 6,
       "panels": [],
@@ -781,7 +882,7 @@
         "h": 10,
         "w": 24,
         "x": 0,
-        "y": 29
+        "y": 39
       },
       "id": 7,
       "interval": "15m",
@@ -925,6 +1026,6 @@
   "timezone": "",
   "title": "Periodic tests",
   "uid": "af7d1d82-c0f9-4d8d-bc03-542c4c4c75c0",
-  "version": 2,
+  "version": 3,
   "weekStart": ""
-}
\ No newline at end of file
+}