From 8b0bb272004f2a5d9686514f96fb0bfbff14182d Mon Sep 17 00:00:00 2001 From: d066607 Date: Mon, 14 Oct 2024 11:19:35 +0200 Subject: [PATCH 1/2] adding safeguard alert * there was no critical alerting when vROps is gone entirely * playbook still needs to be added here before merging --- .../prometheus-vmware-rules/Chart.yaml | 2 +- .../alerts/vrops.alerts | 20 +++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/prometheus-rules/prometheus-vmware-rules/Chart.yaml b/prometheus-rules/prometheus-vmware-rules/Chart.yaml index 0e060d1600..892fee0631 100644 --- a/prometheus-rules/prometheus-vmware-rules/Chart.yaml +++ b/prometheus-rules/prometheus-vmware-rules/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v2 description: A collection of Prometheus alert rules. name: prometheus-vmware-rules -version: 1.0.7 +version: 1.0.8 dependencies: - name: owner-info repository: oci://keppel.eu-de-1.cloud.sap/ccloud-helm diff --git a/prometheus-rules/prometheus-vmware-rules/alerts/vrops.alerts b/prometheus-rules/prometheus-vmware-rules/alerts/vrops.alerts index 4674e3eb2e..9d10f1e80d 100644 --- a/prometheus-rules/prometheus-vmware-rules/alerts/vrops.alerts +++ b/prometheus-rules/prometheus-vmware-rules/alerts/vrops.alerts @@ -22,6 +22,26 @@ groups: which indicates that vrops is reporting internal server errors or is unreachable. Check if vrops is running and healthy. + - alert: VropsAPIDownEntirely + expr: | + abesent(vrops_api_response) + for: 10m + labels: + severity: critical + tier: vmware + service: compute + support_group: compute + context: vrops + playbook: docs/devops/alert/vcenter/vrops#vrops_api_down_entirely + dashboard: vrops-exporter-status + meta: "Vrops API is down. All collectors are not reporting anymore." + annotations: + meta: "Vrops API is down. All collectors are not reporting anymore." + description: | + Exporters can not connect to vrops anymore. Most likely the VM is stuck. 
+ If this alert fires, complete vmware-monitoring is down in Prometheus. Ensure vrops is running and healthy. + - alert: VropsTokenAcquisitionFailed expr: vrops_api_response{get_request="token"} >= 500 for: 15m From 20dcaa0110395ab77ec05002c753b0b98c71e184 Mon Sep 17 00:00:00 2001 From: Tommy Sauer Date: Mon, 14 Oct 2024 16:42:26 +0200 Subject: [PATCH 2/2] typo Co-authored-by: Richard Tief <56597015+richardtief@users.noreply.github.com> --- prometheus-rules/prometheus-vmware-rules/alerts/vrops.alerts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prometheus-rules/prometheus-vmware-rules/alerts/vrops.alerts b/prometheus-rules/prometheus-vmware-rules/alerts/vrops.alerts index 9d10f1e80d..559178b56a 100644 --- a/prometheus-rules/prometheus-vmware-rules/alerts/vrops.alerts +++ b/prometheus-rules/prometheus-vmware-rules/alerts/vrops.alerts @@ -24,7 +24,7 @@ groups: - alert: VropsAPIDownEntirely expr: | - abesent(vrops_api_response) + absent(vrops_api_response) for: 10m labels: severity: critical