Skip to content

Commit

Permalink
Improve alerting
Browse files Browse the repository at this point in the history
- Add more alerting test domains
- alert if at least 2 domains fail
- don't test mail on domains that have no MX
- also measure if probe passed or not
- don't measure scores for probes that don't count towards the total score (appsecpriv)
  • Loading branch information
aequitas committed Sep 18, 2024
1 parent 6d587d8 commit 2cb92cc
Show file tree
Hide file tree
Showing 3 changed files with 140 additions and 17 deletions.
35 changes: 28 additions & 7 deletions docker/cron/periodic/15min/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,28 @@
URL_BASE = f"http://{IPV4_IP_APP_INTERNAL}:8080"
HEADERS = {"Host": INTERNETNL_DOMAINNAME}

# domains to use in website tests
WEBSITE_TEST_DOMAINS = [
"example.nl",
"example.com",
]
TEST_DOMAINS = {
# domains to use in website tests
"site": [
"example.nl",
"example.com",
"internetsociety.org",
"ripe.net",
"surf.nl",
"ecp.nl",
"forumstandaardisatie.nl",
"minez.nl",
],
# domains to use in mail tests
"mail": [
"internetsociety.org",
"ripe.net",
"surf.nl",
"ecp.nl",
"forumstandaardisatie.nl",
"minez.nl",
],
}


METRIC_PROBE_DONE = Gauge("tests_probe_done_total", "Whether the probe completed.", ["test", "domain", "probe"])
Expand All @@ -46,6 +63,7 @@
"tests_probe_runtime_seconds", "Amount of time probe ran before done.", ["test", "domain", "probe"]
)
METRIC_PROBE_SCORE = Gauge("tests_probe_score", "Score of the probe.", ["test", "domain", "probe"])
METRIC_PROBE_PASSED = Gauge("tests_probe_pass", "Probe has passed.", ["test", "domain", "probe"])

METRIC_TEST_RUN = Gauge("tests_test_run_total", "Test that have been run.", ["test", "domain"])
METRIC_TEST_CACHE = Gauge("tests_test_cached_total", "Test runs that returned cached results.", ["test", "domain"])
Expand Down Expand Up @@ -119,14 +137,17 @@ def run_tests_on_domain(test, domain):
r.raise_for_status()
if r.status_code == 200:
probe_result = r.json()
METRIC_PROBE_SCORE.labels(test, domain, probe_name).set(probe_result["totalscore"])
# only measure probe scores that count towards total score
if probe_result["maxscore"]:
METRIC_PROBE_SCORE.labels(test, domain, probe_name).set(probe_result["totalscore"])
METRIC_PROBE_PASSED.labels(test, domain, probe_name).set(probe_result["verdict"] == "passed")
except Exception:
log.exception("failed to get probe score")


def run_tests():
for test in TESTS:
for domain in WEBSITE_TEST_DOMAINS:
for domain in TEST_DOMAINS[test]:
log.info(f"testing: {test} {domain}")
METRIC_TEST_RUN.labels(test, domain).set(1)
METRIC_TEST_CACHE.labels(test, domain).set(0)
Expand Down
3 changes: 2 additions & 1 deletion docker/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1073,7 +1073,8 @@ configs:
- name: End to end monitoring
rules:
- alert: HighTestRuntime
expr: min(tests_test_runtime_seconds{test="site"})>=10 and max(tests_test_runtime_seconds{test="site"})>=30
# when site tests for 2 or more of the test domains take 30 seconds or longer, something is wrong
expr: count(tests_test_runtime_seconds{test="site"} >= 30) >= 2
annotations:
host: $INTERNETNL_DOMAINNAME
summary: Tests/probes take longer to complete than expected
Expand Down
119 changes: 110 additions & 9 deletions docker/grafana/dashboards/periodic-tests.json
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"description": "",
"fieldConfig": {
"defaults": {
"color": {
Expand All @@ -53,7 +54,7 @@
},
{
"color": "#EAB839",
"value": 30
"value": 40
},
{
"color": "red",
Expand Down Expand Up @@ -95,13 +96,13 @@
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "avg(tests_test_runtime_seconds{test=~\"$test\", domain=~\"$domain\"})",
"expr": "quantile( 0.50, tests_test_runtime_seconds{test=~\"$test\", domain=~\"$domain\"})",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Average runtime",
"title": "Median runtime",
"type": "stat"
},
{
Expand Down Expand Up @@ -167,7 +168,7 @@
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "avg(tests_probe_score{test=~\"$test\", domain=~\"$domain\"})",
"expr": "avg(tests_probe_score{test=~\"$test\", domain=~\"$domain\", probe=~\"$probe\"})",
"legendFormat": "__auto",
"range": true,
"refId": "A"
Expand Down Expand Up @@ -670,7 +671,7 @@
"x": 0,
"y": 18
},
"id": 5,
"id": 14,
"interval": "15m",
"options": {
"legend": {
Expand Down Expand Up @@ -705,13 +706,113 @@
"title": "Probe scores per domain",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "bool_yes_no"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 24,
"x": 0,
"y": 28
},
"id": 5,
"interval": "15m",
"options": {
"legend": {
"calcs": [
"min",
"max",
"mean",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "tests_probe_pass{test=~\"$test\", domain=~\"$domain\", probe=~\"$probe\"}",
"legendFormat": "{{test}} - {{domain}} - {{probe}}",
"range": true,
"refId": "A"
}
],
"title": "Passed probes per domain",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 28
"y": 38
},
"id": 6,
"panels": [],
Expand Down Expand Up @@ -781,7 +882,7 @@
"h": 10,
"w": 24,
"x": 0,
"y": 29
"y": 39
},
"id": 7,
"interval": "15m",
Expand Down Expand Up @@ -925,6 +1026,6 @@
"timezone": "",
"title": "Periodic tests",
"uid": "af7d1d82-c0f9-4d8d-bc03-542c4c4c75c0",
"version": 2,
"version": 3,
"weekStart": ""
}
}

0 comments on commit 2cb92cc

Please sign in to comment.