From db53d3a02299dfb49ad542b0f1055dca97b8f33a Mon Sep 17 00:00:00 2001
From: Martin Vossen
Date: Fri, 20 Jul 2018 10:31:48 +0200
Subject: [PATCH] separate more openstack canary alerts

---
Notes (this region is ignored when the patch is applied):
  * Adds three per-service canary alerts (CanaryDown, CanaryTimeout,
    CanaryFlapping) to each of the arc, cinder, designate, hermes, lbaas,
    manila and swift rule files, all built on blackbox_canary_status_gauge.
  * Extends the service!="..." exclusions of the two generic alerts in
    openstack-blackbox.alerts so that these seven services are no longer
    matched by OpenstackCanaryHealthCheckFailing / ...Flapping.
  * Per the alert texts, gauge value 1 is reported as "down" and 0.5 as
    "timing out"; the exporter itself is not part of this patch, so
    confirm those semantics against it.
  * The arc, cinder and hermes post-images end without a trailing newline
    (see the "\ No newline at end of file" markers); the manila post-image
    ends with an extra blank line.

 .../prometheus-frontend/openstack-arc.alerts  | 46 +++++++++++++++++
 .../openstack-blackbox.alerts                 |  4 +-
 .../openstack-cinder.alerts                   | 49 +++++++++++++++++-
 .../openstack-designate.alerts                | 46 +++++++++++++++++
 .../openstack-hermes.alerts                   | 49 +++++++++++++++++-
 .../openstack-lbaas.alerts                    | 49 +++++++++++++++++-
 .../openstack-manila.alerts                   | 50 ++++++++++++++++++-
 .../openstack-swift.alerts                    | 49 +++++++++++++++++-
 8 files changed, 335 insertions(+), 7 deletions(-)

diff --git a/system/kube-monitoring/charts/prometheus-frontend/openstack-arc.alerts b/system/kube-monitoring/charts/prometheus-frontend/openstack-arc.alerts
index 987ad2fc69..0badaebe5a 100644
--- a/system/kube-monitoring/charts/prometheus-frontend/openstack-arc.alerts
+++ b/system/kube-monitoring/charts/prometheus-frontend/openstack-arc.alerts
@@ -45,3 +45,49 @@ groups:
       description: '{{ $labels.check }} API is flapping for 30 minutes.'
       summary: '{{ $labels.check }} API flapping'
 
+  - alert: OpenstackArcCanaryDown
+    expr: blackbox_canary_status_gauge{service=~"arc"} == 1
+    for: 1h
+    labels:
+      severity: warning
+      tier: openstack
+      service: '{{ $labels.service }}'
+      context: '{{ $labels.service }}'
+      dashboard: ccloud-health-canary-details
+      meta: 'Canary {{ $labels.service }} {{ $labels.check }} is down for 1 hour. See Sentry for details'
+      sentry: 'blackbox/?query=test_{{ $labels.check }}'
+      playbook: 'docs/devops/alert/{{ $labels.service }}'
+    annotations:
+      description: 'Canary {{ $labels.service }} {{ $labels.check }} is down for 1 hour. See Sentry for details'
+      summary: 'Canary {{ $labels.service }} {{ $labels.check }} is down'
+
+  - alert: OpenstackArcCanaryTimeout
+    expr: blackbox_canary_status_gauge{service=~"arc"} == 0.5
+    for: 1h
+    labels:
+      severity: info
+      tier: openstack
+      service: '{{ $labels.service }}'
+      context: '{{ $labels.service }}'
+      dashboard: ccloud-health-canary-details
+      meta: 'Canary {{ $labels.service }} {{ $labels.check }} is timing out for 1 hour. See Sentry for details'
+      sentry: 'blackbox/?query=test_{{ $labels.check }}'
+      playbook: 'docs/devops/alert/{{ $labels.service }}'
+    annotations:
+      description: 'Canary {{ $labels.service }} {{ $labels.check }} is timing out for 1 hour. See Sentry for details'
+      summary: 'Canary {{ $labels.service }} {{ $labels.check }} is timing out'
+
+  - alert: OpenstackArcCanaryFlapping
+    expr: changes(blackbox_canary_status_gauge{service=~"arc"}[2h]) > 8
+    labels:
+      severity: warning
+      tier: openstack
+      service: '{{ $labels.service }}'
+      context: '{{ $labels.service }}'
+      dashboard: ccloud-health-canary-details
+      meta: 'Canary {{ $labels.service }} {{ $labels.check }} is flapping for 2 hours. See Sentry for details'
+      sentry: 'blackbox/?query=test_{{ $labels.check }}'
+      playbook: 'docs/devops/alert/{{ $labels.service }}'
+    annotations:
+      description: 'Canary {{ $labels.service }} {{ $labels.check }} is flapping for 2 hours. See Sentry for details'
+      summary: 'Canary {{ $labels.service }} {{ $labels.check }} is flapping'
\ No newline at end of file
diff --git a/system/kube-monitoring/charts/prometheus-frontend/openstack-blackbox.alerts b/system/kube-monitoring/charts/prometheus-frontend/openstack-blackbox.alerts
index 04033c827c..65db59ac5f 100644
--- a/system/kube-monitoring/charts/prometheus-frontend/openstack-blackbox.alerts
+++ b/system/kube-monitoring/charts/prometheus-frontend/openstack-blackbox.alerts
@@ -80,7 +80,7 @@ groups:
       summary: Blackbox datapath test
 
   - alert: OpenstackCanaryHealthCheckFailing
-    expr: blackbox_canary_status_gauge{service!="nova", service!="neutron"} == 1
+    expr: blackbox_canary_status_gauge{service!="nova", service!="neutron", service!="cinder", service!="manila", service!="lbaas", service!="swift", service!="designate", service!="arc", service!="hermes"} == 1
     for: 1h
     labels:
       severity: warning
@@ -96,7 +96,7 @@ groups:
       summary: Blackbox canary test
 
   - alert: OpenstackCanaryHealthCheckFlapping
-    expr: changes(blackbox_canary_status_gauge{service!="nova", service!="neutron"}[2h]) > 8
+    expr: changes(blackbox_canary_status_gauge{service!="nova", service!="neutron", service!="cinder", service!="manila", service!="lbaas", service!="swift", service!="designate", service!="arc", service!="hermes"}[2h]) > 8
     labels:
       severity: warning
       tier: openstack
diff --git a/system/kube-monitoring/charts/prometheus-frontend/openstack-cinder.alerts b/system/kube-monitoring/charts/prometheus-frontend/openstack-cinder.alerts
index 0f68d8a5e1..e076477ae8 100644
--- a/system/kube-monitoring/charts/prometheus-frontend/openstack-cinder.alerts
+++ b/system/kube-monitoring/charts/prometheus-frontend/openstack-cinder.alerts
@@ -72,4 +72,51 @@ groups:
       playbook: 'docs/devops/alert/{{ $labels.service }}/#{{ $labels.check }}'
     annotations:
       description: '{{ $labels.check }} API is flapping for 30 minutes.'
-      summary: '{{ $labels.check }} API flapping'
\ No newline at end of file
+      summary: '{{ $labels.check }} API flapping'
+
+  - alert: OpenstackCinderCanaryDown
+    expr: blackbox_canary_status_gauge{service=~"cinder"} == 1
+    for: 1h
+    labels:
+      severity: warning
+      tier: openstack
+      service: '{{ $labels.service }}'
+      context: '{{ $labels.service }}'
+      dashboard: ccloud-health-canary-details
+      meta: 'Canary {{ $labels.service }} {{ $labels.check }} is down for 1 hour. See Sentry for details'
+      sentry: 'blackbox/?query=test_{{ $labels.check }}'
+      playbook: 'docs/devops/alert/{{ $labels.service }}'
+    annotations:
+      description: 'Canary {{ $labels.service }} {{ $labels.check }} is down for 1 hour. See Sentry for details'
+      summary: 'Canary {{ $labels.service }} {{ $labels.check }} is down'
+
+  - alert: OpenstackCinderCanaryTimeout
+    expr: blackbox_canary_status_gauge{service=~"cinder"} == 0.5
+    for: 1h
+    labels:
+      severity: info
+      tier: openstack
+      service: '{{ $labels.service }}'
+      context: '{{ $labels.service }}'
+      dashboard: ccloud-health-canary-details
+      meta: 'Canary {{ $labels.service }} {{ $labels.check }} is timing out for 1 hour. See Sentry for details'
+      sentry: 'blackbox/?query=test_{{ $labels.check }}'
+      playbook: 'docs/devops/alert/{{ $labels.service }}'
+    annotations:
+      description: 'Canary {{ $labels.service }} {{ $labels.check }} is timing out for 1 hour. See Sentry for details'
+      summary: 'Canary {{ $labels.service }} {{ $labels.check }} is timing out'
+
+  - alert: OpenstackCinderCanaryFlapping
+    expr: changes(blackbox_canary_status_gauge{service=~"cinder"}[2h]) > 8
+    labels:
+      severity: warning
+      tier: openstack
+      service: '{{ $labels.service }}'
+      context: '{{ $labels.service }}'
+      dashboard: ccloud-health-canary-details
+      meta: 'Canary {{ $labels.service }} {{ $labels.check }} is flapping for 2 hours. See Sentry for details'
+      sentry: 'blackbox/?query=test_{{ $labels.check }}'
+      playbook: 'docs/devops/alert/{{ $labels.service }}'
+    annotations:
+      description: 'Canary {{ $labels.service }} {{ $labels.check }} is flapping for 2 hours. See Sentry for details'
+      summary: 'Canary {{ $labels.service }} {{ $labels.check }} is flapping'
\ No newline at end of file
diff --git a/system/kube-monitoring/charts/prometheus-frontend/openstack-designate.alerts b/system/kube-monitoring/charts/prometheus-frontend/openstack-designate.alerts
index c729a45008..006a5dcbe3 100644
--- a/system/kube-monitoring/charts/prometheus-frontend/openstack-designate.alerts
+++ b/system/kube-monitoring/charts/prometheus-frontend/openstack-designate.alerts
@@ -121,3 +121,49 @@ groups:
       description: 'Datapath {{ $labels.service }} {{ $labels.check }} is flapping for 30 minutes. See Sentry for details'
       summary: 'Datapath {{ $labels.service }} {{ $labels.check }} is flapping'
 
+  - alert: OpenstackDesignateCanaryDown
+    expr: blackbox_canary_status_gauge{service=~"designate"} == 1
+    for: 1h
+    labels:
+      severity: warning
+      tier: openstack
+      service: '{{ $labels.service }}'
+      context: '{{ $labels.service }}'
+      dashboard: ccloud-health-canary-details
+      meta: 'Canary {{ $labels.service }} {{ $labels.check }} is down for 1 hour. See Sentry for details'
+      sentry: 'blackbox/?query=test_{{ $labels.check }}'
+      playbook: 'docs/devops/alert/{{ $labels.service }}'
+    annotations:
+      description: 'Canary {{ $labels.service }} {{ $labels.check }} is down for 1 hour. See Sentry for details'
+      summary: 'Canary {{ $labels.service }} {{ $labels.check }} is down'
+
+  - alert: OpenstackDesignateCanaryTimeout
+    expr: blackbox_canary_status_gauge{service=~"designate"} == 0.5
+    for: 1h
+    labels:
+      severity: info
+      tier: openstack
+      service: '{{ $labels.service }}'
+      context: '{{ $labels.service }}'
+      dashboard: ccloud-health-canary-details
+      meta: 'Canary {{ $labels.service }} {{ $labels.check }} is timing out for 1 hour. See Sentry for details'
+      sentry: 'blackbox/?query=test_{{ $labels.check }}'
+      playbook: 'docs/devops/alert/{{ $labels.service }}'
+    annotations:
+      description: 'Canary {{ $labels.service }} {{ $labels.check }} is timing out for 1 hour. See Sentry for details'
+      summary: 'Canary {{ $labels.service }} {{ $labels.check }} is timing out'
+
+  - alert: OpenstackDesignateCanaryFlapping
+    expr: changes(blackbox_canary_status_gauge{service=~"designate"}[2h]) > 8
+    labels:
+      severity: warning
+      tier: openstack
+      service: '{{ $labels.service }}'
+      context: '{{ $labels.service }}'
+      dashboard: ccloud-health-canary-details
+      meta: 'Canary {{ $labels.service }} {{ $labels.check }} is flapping for 2 hours. See Sentry for details'
+      sentry: 'blackbox/?query=test_{{ $labels.check }}'
+      playbook: 'docs/devops/alert/{{ $labels.service }}'
+    annotations:
+      description: 'Canary {{ $labels.service }} {{ $labels.check }} is flapping for 2 hours. See Sentry for details'
+      summary: 'Canary {{ $labels.service }} {{ $labels.check }} is flapping'
diff --git a/system/kube-monitoring/charts/prometheus-frontend/openstack-hermes.alerts b/system/kube-monitoring/charts/prometheus-frontend/openstack-hermes.alerts
index c8dc60d293..8bd1b8e163 100644
--- a/system/kube-monitoring/charts/prometheus-frontend/openstack-hermes.alerts
+++ b/system/kube-monitoring/charts/prometheus-frontend/openstack-hermes.alerts
@@ -56,4 +56,51 @@ groups:
       playbook: 'docs/devops/alert/{{ $labels.service }}/#{{ $labels.check }}'
     annotations:
       description: '{{ $labels.check }} API is flapping for 30 minutes.'
-      summary: '{{ $labels.check }} API flapping'
\ No newline at end of file
+      summary: '{{ $labels.check }} API flapping'
+
+  - alert: OpenstackHermesCanaryDown
+    expr: blackbox_canary_status_gauge{service=~"hermes"} == 1
+    for: 1h
+    labels:
+      severity: warning
+      tier: openstack
+      service: '{{ $labels.service }}'
+      context: '{{ $labels.service }}'
+      dashboard: ccloud-health-canary-details
+      meta: 'Canary {{ $labels.service }} {{ $labels.check }} is down for 1 hour. See Sentry for details'
+      sentry: 'blackbox/?query=test_{{ $labels.check }}'
+      playbook: 'docs/devops/alert/{{ $labels.service }}'
+    annotations:
+      description: 'Canary {{ $labels.service }} {{ $labels.check }} is down for 1 hour. See Sentry for details'
+      summary: 'Canary {{ $labels.service }} {{ $labels.check }} is down'
+
+  - alert: OpenstackHermesCanaryTimeout
+    expr: blackbox_canary_status_gauge{service=~"hermes"} == 0.5
+    for: 1h
+    labels:
+      severity: info
+      tier: openstack
+      service: '{{ $labels.service }}'
+      context: '{{ $labels.service }}'
+      dashboard: ccloud-health-canary-details
+      meta: 'Canary {{ $labels.service }} {{ $labels.check }} is timing out for 1 hour. See Sentry for details'
+      sentry: 'blackbox/?query=test_{{ $labels.check }}'
+      playbook: 'docs/devops/alert/{{ $labels.service }}'
+    annotations:
+      description: 'Canary {{ $labels.service }} {{ $labels.check }} is timing out for 1 hour. See Sentry for details'
+      summary: 'Canary {{ $labels.service }} {{ $labels.check }} is timing out'
+
+  - alert: OpenstackHermesCanaryFlapping
+    expr: changes(blackbox_canary_status_gauge{service=~"hermes"}[2h]) > 8
+    labels:
+      severity: warning
+      tier: openstack
+      service: '{{ $labels.service }}'
+      context: '{{ $labels.service }}'
+      dashboard: ccloud-health-canary-details
+      meta: 'Canary {{ $labels.service }} {{ $labels.check }} is flapping for 2 hours. See Sentry for details'
+      sentry: 'blackbox/?query=test_{{ $labels.check }}'
+      playbook: 'docs/devops/alert/{{ $labels.service }}'
+    annotations:
+      description: 'Canary {{ $labels.service }} {{ $labels.check }} is flapping for 2 hours. See Sentry for details'
+      summary: 'Canary {{ $labels.service }} {{ $labels.check }} is flapping'
\ No newline at end of file
diff --git a/system/kube-monitoring/charts/prometheus-frontend/openstack-lbaas.alerts b/system/kube-monitoring/charts/prometheus-frontend/openstack-lbaas.alerts
index de15e99312..ee3225d316 100644
--- a/system/kube-monitoring/charts/prometheus-frontend/openstack-lbaas.alerts
+++ b/system/kube-monitoring/charts/prometheus-frontend/openstack-lbaas.alerts
@@ -77,4 +77,51 @@ groups:
       playbook: 'docs/devops/alert/{{ $labels.service }}/#{{ $labels.check }}'
     annotations:
       description: 'Datapath {{ $labels.service }} {{ $labels.check }} is flapping for 30 minutes. See Sentry for details'
-      summary: 'Datapath {{ $labels.service }} {{ $labels.check }} is flapping'
\ No newline at end of file
+      summary: 'Datapath {{ $labels.service }} {{ $labels.check }} is flapping'
+
+  - alert: OpenstackLbaasCanaryDown
+    expr: blackbox_canary_status_gauge{service=~"lbaas"} == 1
+    for: 1h
+    labels:
+      severity: warning
+      tier: openstack
+      service: '{{ $labels.service }}'
+      context: '{{ $labels.service }}'
+      dashboard: ccloud-health-canary-details
+      meta: 'Canary {{ $labels.service }} {{ $labels.check }} is down for 1 hour. See Sentry for details'
+      sentry: 'blackbox/?query=test_{{ $labels.check }}'
+      playbook: 'docs/devops/alert/{{ $labels.service }}'
+    annotations:
+      description: 'Canary {{ $labels.service }} {{ $labels.check }} is down for 1 hour. See Sentry for details'
+      summary: 'Canary {{ $labels.service }} {{ $labels.check }} is down'
+
+  - alert: OpenstackLbaasCanaryTimeout
+    expr: blackbox_canary_status_gauge{service=~"lbaas"} == 0.5
+    for: 1h
+    labels:
+      severity: info
+      tier: openstack
+      service: '{{ $labels.service }}'
+      context: '{{ $labels.service }}'
+      dashboard: ccloud-health-canary-details
+      meta: 'Canary {{ $labels.service }} {{ $labels.check }} is timing out for 1 hour. See Sentry for details'
+      sentry: 'blackbox/?query=test_{{ $labels.check }}'
+      playbook: 'docs/devops/alert/{{ $labels.service }}'
+    annotations:
+      description: 'Canary {{ $labels.service }} {{ $labels.check }} is timing out for 1 hour. See Sentry for details'
+      summary: 'Canary {{ $labels.service }} {{ $labels.check }} is timing out'
+
+  - alert: OpenstackLbaasCanaryFlapping
+    expr: changes(blackbox_canary_status_gauge{service=~"lbaas"}[2h]) > 8
+    labels:
+      severity: warning
+      tier: openstack
+      service: '{{ $labels.service }}'
+      context: '{{ $labels.service }}'
+      dashboard: ccloud-health-canary-details
+      meta: 'Canary {{ $labels.service }} {{ $labels.check }} is flapping for 2 hours. See Sentry for details'
+      sentry: 'blackbox/?query=test_{{ $labels.check }}'
+      playbook: 'docs/devops/alert/{{ $labels.service }}'
+    annotations:
+      description: 'Canary {{ $labels.service }} {{ $labels.check }} is flapping for 2 hours. See Sentry for details'
+      summary: 'Canary {{ $labels.service }} {{ $labels.check }} is flapping'
diff --git a/system/kube-monitoring/charts/prometheus-frontend/openstack-manila.alerts b/system/kube-monitoring/charts/prometheus-frontend/openstack-manila.alerts
index c80ce256ce..9ac5f5c6fa 100644
--- a/system/kube-monitoring/charts/prometheus-frontend/openstack-manila.alerts
+++ b/system/kube-monitoring/charts/prometheus-frontend/openstack-manila.alerts
@@ -134,4 +134,52 @@ groups:
       playbook: 'docs/devops/alert/{{ $labels.service }}/#{{ $labels.check }}'
     annotations:
       description: 'Datapath {{ $labels.service }} {{ $labels.check }} is flapping for 30 minutes. See Sentry for details'
-      summary: 'Datapath {{ $labels.service }} {{ $labels.check }} is flapping'
\ No newline at end of file
+      summary: 'Datapath {{ $labels.service }} {{ $labels.check }} is flapping'
+
+  - alert: OpenstackManilaCanaryDown
+    expr: blackbox_canary_status_gauge{service=~"manila"} == 1
+    for: 1h
+    labels:
+      severity: warning
+      tier: openstack
+      service: '{{ $labels.service }}'
+      context: '{{ $labels.service }}'
+      dashboard: ccloud-health-canary-details
+      meta: 'Canary {{ $labels.service }} {{ $labels.check }} is down for 1 hour. See Sentry for details'
+      sentry: 'blackbox/?query=test_{{ $labels.check }}'
+      playbook: 'docs/devops/alert/{{ $labels.service }}'
+    annotations:
+      description: 'Canary {{ $labels.service }} {{ $labels.check }} is down for 1 hour. See Sentry for details'
+      summary: 'Canary {{ $labels.service }} {{ $labels.check }} is down'
+
+  - alert: OpenstackManilaCanaryTimeout
+    expr: blackbox_canary_status_gauge{service=~"manila"} == 0.5
+    for: 1h
+    labels:
+      severity: info
+      tier: openstack
+      service: '{{ $labels.service }}'
+      context: '{{ $labels.service }}'
+      dashboard: ccloud-health-canary-details
+      meta: 'Canary {{ $labels.service }} {{ $labels.check }} is timing out for 1 hour. See Sentry for details'
+      sentry: 'blackbox/?query=test_{{ $labels.check }}'
+      playbook: 'docs/devops/alert/{{ $labels.service }}'
+    annotations:
+      description: 'Canary {{ $labels.service }} {{ $labels.check }} is timing out for 1 hour. See Sentry for details'
+      summary: 'Canary {{ $labels.service }} {{ $labels.check }} is timing out'
+
+  - alert: OpenstackManilaCanaryFlapping
+    expr: changes(blackbox_canary_status_gauge{service=~"manila"}[2h]) > 8
+    labels:
+      severity: warning
+      tier: openstack
+      service: '{{ $labels.service }}'
+      context: '{{ $labels.service }}'
+      dashboard: ccloud-health-canary-details
+      meta: 'Canary {{ $labels.service }} {{ $labels.check }} is flapping for 2 hours. See Sentry for details'
+      sentry: 'blackbox/?query=test_{{ $labels.check }}'
+      playbook: 'docs/devops/alert/{{ $labels.service }}'
+    annotations:
+      description: 'Canary {{ $labels.service }} {{ $labels.check }} is flapping for 2 hours. See Sentry for details'
+      summary: 'Canary {{ $labels.service }} {{ $labels.check }} is flapping'
+
diff --git a/system/kube-monitoring/charts/prometheus-frontend/openstack-swift.alerts b/system/kube-monitoring/charts/prometheus-frontend/openstack-swift.alerts
index 1e92a2cf3b..8d15712a90 100644
--- a/system/kube-monitoring/charts/prometheus-frontend/openstack-swift.alerts
+++ b/system/kube-monitoring/charts/prometheus-frontend/openstack-swift.alerts
@@ -245,4 +245,51 @@ groups:
       playbook: 'docs/devops/alert/{{ $labels.service }}/#{{ $labels.check }}'
     annotations:
       description: '{{ $labels.check }} API is flapping for 30 minutes.'
-      summary: '{{ $labels.check }} API flapping'
\ No newline at end of file
+      summary: '{{ $labels.check }} API flapping'
+
+  - alert: OpenstackSwiftCanaryDown
+    expr: blackbox_canary_status_gauge{service=~"swift"} == 1
+    for: 1h
+    labels:
+      severity: warning
+      tier: openstack
+      service: '{{ $labels.service }}'
+      context: '{{ $labels.service }}'
+      dashboard: ccloud-health-canary-details
+      meta: 'Canary {{ $labels.service }} {{ $labels.check }} is down for 1 hour. See Sentry for details'
+      sentry: 'blackbox/?query=test_{{ $labels.check }}'
+      playbook: 'docs/devops/alert/{{ $labels.service }}'
+    annotations:
+      description: 'Canary {{ $labels.service }} {{ $labels.check }} is down for 1 hour. See Sentry for details'
+      summary: 'Canary {{ $labels.service }} {{ $labels.check }} is down'
+
+  - alert: OpenstackSwiftCanaryTimeout
+    expr: blackbox_canary_status_gauge{service=~"swift"} == 0.5
+    for: 1h
+    labels:
+      severity: info
+      tier: openstack
+      service: '{{ $labels.service }}'
+      context: '{{ $labels.service }}'
+      dashboard: ccloud-health-canary-details
+      meta: 'Canary {{ $labels.service }} {{ $labels.check }} is timing out for 1 hour. See Sentry for details'
+      sentry: 'blackbox/?query=test_{{ $labels.check }}'
+      playbook: 'docs/devops/alert/{{ $labels.service }}'
+    annotations:
+      description: 'Canary {{ $labels.service }} {{ $labels.check }} is timing out for 1 hour. See Sentry for details'
+      summary: 'Canary {{ $labels.service }} {{ $labels.check }} is timing out'
+
+  - alert: OpenstackSwiftCanaryFlapping
+    expr: changes(blackbox_canary_status_gauge{service=~"swift"}[2h]) > 8
+    labels:
+      severity: warning
+      tier: openstack
+      service: '{{ $labels.service }}'
+      context: '{{ $labels.service }}'
+      dashboard: ccloud-health-canary-details
+      meta: 'Canary {{ $labels.service }} {{ $labels.check }} is flapping for 2 hours. See Sentry for details'
+      sentry: 'blackbox/?query=test_{{ $labels.check }}'
+      playbook: 'docs/devops/alert/{{ $labels.service }}'
+    annotations:
+      description: 'Canary {{ $labels.service }} {{ $labels.check }} is flapping for 2 hours. See Sentry for details'
+      summary: 'Canary {{ $labels.service }} {{ $labels.check }} is flapping'