Add more alerting test domains and alert if at least 2 domains fail #1508

Merged · 3 commits · Sep 19, 2024
4 changes: 4 additions & 0 deletions .github/workflows/docker.yml
@@ -226,6 +226,8 @@ jobs:
curl -sSfO https://raw.githubusercontent.com/${{github.repository}}/\$BRANCH/docker/user_manage.sh && \\
chmod 755 user_manage.sh && \\
env -i RELEASE="\$RELEASE" DOCKER_REGISTRY="\$DOCKER_REGISTRY" docker compose --env-file=docker/defaults.env --env-file=docker/host.env --env-file=docker/local.env pull && \\
# temporary solution to recreate containers when configs change: https://github.com/internetstandards/Internet.nl/issues/1490 \\
env -i RELEASE="\$RELEASE" DOCKER_REGISTRY="\$DOCKER_REGISTRY" docker compose --env-file=docker/defaults.env --env-file=docker/host.env --env-file=docker/local.env rm --stop --force cron-docker prometheus alertmanager nginx_logs_exporter && \\
env -i RELEASE="\$RELEASE" DOCKER_REGISTRY="\$DOCKER_REGISTRY" docker compose --env-file=docker/defaults.env --env-file=docker/host.env --env-file=docker/local.env up --remove-orphans --wait --no-build
EOF
@@ -246,6 +248,8 @@ jobs:
curl -sSfO https://raw.githubusercontent.com/${{github.repository}}/\$BRANCH/docker/user_manage.sh && \\
chmod 755 user_manage.sh && \\
env -i RELEASE="\$RELEASE" DOCKER_REGISTRY="\$DOCKER_REGISTRY" docker compose --env-file=docker/defaults.env --env-file=docker/host.env --env-file=docker/local.env pull && \\
# temporary solution to recreate containers when configs change: https://github.com/internetstandards/Internet.nl/issues/1490 \\
env -i RELEASE="\$RELEASE" DOCKER_REGISTRY="\$DOCKER_REGISTRY" docker compose --env-file=docker/defaults.env --env-file=docker/host.env --env-file=docker/local.env rm --stop --force cron-docker prometheus alertmanager nginx_logs_exporter && \\
env -i RELEASE="\$RELEASE" DOCKER_REGISTRY="\$DOCKER_REGISTRY" docker compose --env-file=docker/defaults.env --env-file=docker/host.env --env-file=docker/local.env up --remove-orphans --wait --no-build
EOF
2 changes: 1 addition & 1 deletion docker/batch-test.env
@@ -8,7 +8,7 @@ INTERNETNL_DOMAINNAME=internet.test
# use unique name to not conflict with integration tests
COMPOSE_PROJECT_NAME=internetnl-batch-test

COMPOSE_PROFILES=batch
COMPOSE_PROFILES=batch,cron

ENABLE_BATCH=True

2 changes: 1 addition & 1 deletion docker/build.env
@@ -2,7 +2,7 @@

# include all compose files
COMPOSE_FILE=docker/docker-compose.yml:docker/docker-compose-development.yml:docker/docker-compose-integration-tests.yml:docker/docker-compose-test.yml:docker/docker-compose-tools.yml:docker/docker-compose-test-runner-develop.yml:docker/docker-compose-integration-tests.yml
COMPOSE_PROFILES=build
COMPOSE_PROFILES=default,cron,batch,build

# don't expose HTTP(S) and DNS ports to the outside, this also causes issues due to being privileged ports
WEBSERVER_PORT=80
69 changes: 58 additions & 11 deletions docker/cron/periodic/15min/tests.py
@@ -21,7 +23,9 @@
# file to write metrics to https://github.com/prometheus/node_exporter?tab=readme-ov-file#textfile-collector
OUTPUT_TEXTFILE = "/prometheus-textfile-directory/tests.prom"

TEST_TIMEOUT = int(os.environ.get("INTERNETNL_CACHE_TTL"))

DEFAULT_TEST_TIMEOUT = 200
TEST_TIMEOUT = int(os.environ.get("INTERNETNL_CACHE_TTL", DEFAULT_TEST_TIMEOUT))
REQUEST_TIMEOUT = 30

TESTS = ["site", "mail"]
@@ -33,29 +35,53 @@
URL_BASE = f"http://{IPV4_IP_APP_INTERNAL}:8080"
HEADERS = {"Host": INTERNETNL_DOMAINNAME}

# domains to use in website tests
WEBSITE_TEST_DOMAINS = [
"example.nl",
"example.com",
]

TEST_DOMAINS = {
# domains to use in website tests
"site": [
"internet.nl",
"example.nl",
"example.com",
"internetsociety.org",
"ripe.net",
"surf.nl",
"ecp.nl",
"forumstandaardisatie.nl",
"minez.nl",
],
# domains to use in mail tests
"mail": [
"internetsociety.org",
"ripe.net",
"surf.nl",
"ecp.nl",
# these are currently really slow and will probably improve when
# we switch to sslyze, for now disable these in monitoring
# "internet.nl",
# "forumstandaardisatie.nl",
# "minez.nl",
],
}

METRIC_PROBE_DONE = Gauge("tests_probe_done_total", "Whether the probe completed.", ["test", "domain", "probe"])
METRIC_PROBE_SUCCESS = Gauge("tests_probe_success_total", "Whether the probe succeeded.", ["test", "domain", "probe"])
METRIC_PROBE_RUNTIME = Gauge(
"tests_probe_runtime_seconds", "Amount of time probe ran before done.", ["test", "domain", "probe"]
)
METRIC_PROBE_SCORE = Gauge("tests_probe_score", "Score of the probe.", ["test", "domain", "probe"])
METRIC_PROBE_PASSED = Gauge("tests_probe_pass", "Probe has passed.", ["test", "domain", "probe"])

METRIC_TEST_RUN = Gauge("tests_test_run_total", "Tests that have been run.", ["test", "domain"])
METRIC_TEST_CACHE = Gauge("tests_test_cached_total", "Test runs that returned cached results.", ["test", "domain"])
METRIC_TEST_FAILURE = Gauge("tests_test_failure_total", "Test runs that failed.", ["test", "domain"])
METRIC_TEST_SUCCESS = Gauge("tests_test_success_total", "Test runs that succeeded.", ["test", "domain"])
METRIC_TEST_TIMEOUT = Gauge("tests_test_timeout", "Tests that ran into timeout.", ["test", "domain"])
METRIC_TEST_TIMEOUT = Gauge("tests_test_timeout_total", "Tests that ran into timeout.", ["test", "domain"])
METRIC_TEST_RUNTIME = Gauge("tests_test_runtime_seconds", "Amount of time test ran before done.", ["test", "domain"])
METRIC_TEST_SCORE = Gauge("tests_test_score", "Total score of all probes in the test.", ["test", "domain"])


def run_tests_on_domain(test, domain):
test_start = int(time.time())

# initiate the test
r = requests.get(
f"{URL_BASE}/{test}/probes/{domain}/?{time.time()}",
@@ -73,7 +99,6 @@ def run_tests_on_domain(test, domain):
return

# poll probes until done
test_start = int(time.time())
finished_probes = set()
while int(time.time()) < test_start + TEST_TIMEOUT:
# get probe status
@@ -99,15 +124,26 @@

# stop when all probes are finished
if not [p for p in probes if not p["done"]]:
METRIC_TEST_SUCCESS.labels(test, domain).set(1)
break

time.sleep(1)
else:
# while/else: this branch runs only when the loop exhausted TEST_TIMEOUT without a break, i.e. one or more probes did not finish in time
METRIC_TEST_TIMEOUT.labels(test, domain).set(1)
for probe in probes:
if probe["name"] in finished_probes:
continue
# record unfinished probes as failed
METRIC_PROBE_DONE.labels(test, domain, probe["name"]).set(probe["done"])
METRIC_PROBE_RUNTIME.labels(test, domain, probe["name"]).set(int(time.time() - test_start))
if probe["done"]:
METRIC_PROBE_SUCCESS.labels(test, domain, probe["name"]).set(probe["success"])

METRIC_TEST_RUNTIME.labels(test, domain).set(int(time.time() - test_start))

# get additional metrics like score
scores = list()
for probe_name in finished_probes:
try:
r = requests.get(
@@ -119,18 +155,29 @@
r.raise_for_status()
if r.status_code == 200:
probe_result = r.json()
METRIC_PROBE_SCORE.labels(test, domain, probe_name).set(probe_result["totalscore"])
# only measure probe scores that count towards total score
if probe_result["maxscore"]:
METRIC_PROBE_SCORE.labels(test, domain, probe_name).set(probe_result["totalscore"])
scores.append(probe_result["totalscore"])
METRIC_PROBE_PASSED.labels(test, domain, probe_name).set(probe_result["verdict"] == "passed")
except Exception:
log.exception("failed to get probe score")

if scores:
METRIC_TEST_SCORE.labels(test, domain).set(max(min(int(sum(scores) / len(scores)), 100), 0))
else:
METRIC_TEST_SCORE.labels(test, domain).set(0)


def run_tests():
for test in TESTS:
for domain in WEBSITE_TEST_DOMAINS:
for domain in TEST_DOMAINS[test]:
log.info(f"testing: {test} {domain}")
METRIC_TEST_RUN.labels(test, domain).set(1)
METRIC_TEST_CACHE.labels(test, domain).set(0)
METRIC_TEST_FAILURE.labels(test, domain).set(0)
METRIC_TEST_TIMEOUT.labels(test, domain).set(0)
METRIC_TEST_SUCCESS.labels(test, domain).set(0)
try:
run_tests_on_domain(test, domain)
except Exception:
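For context on where these Gauge values end up: the comment at the top of tests.py points at node_exporter's textfile collector, with OUTPUT_TEXTFILE as the target path. Below is a minimal sketch of that pattern, assuming the script writes via prometheus_client's write_to_textfile (the write call itself is outside this diff; the label values are illustrative):

from prometheus_client import CollectorRegistry, Gauge, write_to_textfile

# A dedicated registry so only these metrics end up in the file.
registry = CollectorRegistry()
runtime = Gauge(
    "tests_test_runtime_seconds",
    "Amount of time test ran before done.",
    ["test", "domain"],
    registry=registry,
)

runtime.labels("site", "example.nl").set(42)  # illustrative value

# Atomically writes all metrics in the registry to the .prom file;
# node_exporter's textfile collector picks up *.prom files from this directory.
write_to_textfile("/prometheus-textfile-directory/tests.prom", registry)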
4 changes: 2 additions & 2 deletions docker/defaults.env
@@ -124,8 +124,8 @@ COMPOSE_PROJECT_NAME=internetnl
# configure which compose files are included in this environment
COMPOSE_FILE=docker/docker-compose.yml

# enable all services
COMPOSE_PROFILES=default
# enable all services and cron jobs
COMPOSE_PROFILES=default,cron

# interval for batch processing
BATCH_SCHEDULER_INTERVAL=1
5 changes: 4 additions & 1 deletion docker/develop.env
@@ -68,4 +68,7 @@ CRON_15MIN_RUN_TESTS=False
INTERNETNL_BRANDING=False

# disable caching in development environment
NGINX_PROXY_CACHE=off
NGINX_PROXY_CACHE=off

# only enable application services
COMPOSE_PROFILES=default
22 changes: 19 additions & 3 deletions docker/docker-compose.yml
@@ -710,6 +710,9 @@ services:
start_period: 1m
retries: 10

profiles:
- cron

# cron daemon with access to Docker socket but no networking
cron-docker:
image: alpinelinux/docker-cli:latest
@@ -739,6 +742,9 @@
volumes:
- /var/run/docker.sock:/var/run/docker.sock

profiles:
- cron

grafana:
image: ${DOCKER_IMAGE_GRAFANA:-${DOCKER_REGISTRY:-ghcr.io/internetstandards}/grafana:${RELEASE:-latest}}
build:
@@ -1072,12 +1078,22 @@ configs:
groups:
- name: End to end monitoring
rules:
- alert: HighTestRuntime
expr: min(tests_test_runtime_seconds{test="site"})>=10 and max(tests_test_runtime_seconds{test="site"})>=30
- alert: HighTestRuntimeSite
# when site probes for 2 or more of the test domains take longer than 30 seconds, something is wrong
expr: count(tests_test_runtime_seconds{test="site"} >= 30) >= 2
annotations:
host: $INTERNETNL_DOMAINNAME
summary: Tests/probes take longer to complete than expected
summary: Two or more tests for web take longer to complete than expected
dashboard: 'https://$INTERNETNL_DOMAINNAME/grafana/d/af7d1d82-c0f9-4d8d-bc03-542c4c4c75c0/periodic-tests'
- alert: HighTestRuntimeMail
# when mail probes for 2 or more of the test domains take longer than 70 seconds, something is wrong
expr: count(tests_test_runtime_seconds{test="mail"} >= 70) >= 2
annotations:
host: $INTERNETNL_DOMAINNAME
summary: Two or more tests for mail take longer to complete than expected
dashboard: 'https://$INTERNETNL_DOMAINNAME/grafana/d/af7d1d82-c0f9-4d8d-bc03-542c4c4c75c0/periodic-tests'


alertmanager_config:
content: |
global:
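The rewritten alert expressions use PromQL's filter-then-count pattern: tests_test_runtime_seconds{test="site"} >= 30 keeps only the series at or above the threshold, and count() counts the survivors, so a single slow domain no longer fires the alert. A rough Python equivalent of the site rule, with hypothetical sample runtimes:

# Hypothetical per-domain runtimes, standing in for the
# tests_test_runtime_seconds{test="site"} series in Prometheus.
runtimes = {
    "internet.nl": 12,
    "example.nl": 45,
    "ripe.net": 31,
    "surf.nl": 8,
}

# PromQL: count(tests_test_runtime_seconds{test="site"} >= 30) >= 2
slow = [domain for domain, seconds in runtimes.items() if seconds >= 30]
alert_fires = len(slow) >= 2  # True here: example.nl and ripe.net are both >= 30s
print(alert_fires)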