From a427b6ebf7381d4a290c1aeb299792cd2dd8277a Mon Sep 17 00:00:00 2001 From: wejdross Date: Tue, 17 Dec 2024 11:41:04 +0100 Subject: [PATCH] fixing vshn_pg and redis --- component/vshn_postgres.jsonnet | 2 +- component/vshn_redis.jsonnet | 2 +- .../appcat/22_prom_rule_sla_postgres.yaml | 15 -- .../appcat/appcat/22_prom_rule_sla_redis.yaml | 15 -- .../sli_exporter/70_slo_vshn_postgresql.yaml | 193 ------------------ .../sli_exporter/70_slo_vshn_redis.yaml | 193 ------------------ .../80_slo_vshn_postgresql_ha.yaml | 193 ------------------ .../sli_exporter/80_slo_vshn_redis_ha.yaml | 193 ------------------ .../90_VSHNPostgreSQL_Opsgenie.yaml | 45 ---- .../sli_exporter/90_VSHNRedis_Opsgenie.yaml | 45 ---- 10 files changed, 2 insertions(+), 894 deletions(-) delete mode 100644 tests/golden/vshn-cloud/appcat/appcat/22_prom_rule_sla_postgres.yaml delete mode 100644 tests/golden/vshn-cloud/appcat/appcat/22_prom_rule_sla_redis.yaml delete mode 100644 tests/golden/vshn-cloud/appcat/appcat/sli_exporter/70_slo_vshn_postgresql.yaml delete mode 100644 tests/golden/vshn-cloud/appcat/appcat/sli_exporter/70_slo_vshn_redis.yaml delete mode 100644 tests/golden/vshn-cloud/appcat/appcat/sli_exporter/80_slo_vshn_postgresql_ha.yaml delete mode 100644 tests/golden/vshn-cloud/appcat/appcat/sli_exporter/80_slo_vshn_redis_ha.yaml delete mode 100644 tests/golden/vshn-cloud/appcat/appcat/sli_exporter/90_VSHNPostgreSQL_Opsgenie.yaml delete mode 100644 tests/golden/vshn-cloud/appcat/appcat/sli_exporter/90_VSHNRedis_Opsgenie.yaml diff --git a/component/vshn_postgres.jsonnet b/component/vshn_postgres.jsonnet index 4ba065580..42364db2b 100644 --- a/component/vshn_postgres.jsonnet +++ b/component/vshn_postgres.jsonnet @@ -315,7 +315,7 @@ local plansCM = kube.ConfigMap('vshnpostgresqlplans') + { [if isOpenshift then '11_stackgres_openshift_operator']: std.prune(stackgresOperator), [if isOpenshift then '12_stackgres_openshift_operator_netpol']: stackgresNetworkPolicy, } else {}) -+ if vars.isSingleOrServiceCluster then { ++ if vars.isSingleOrServiceCluster && inv.parameters.facts.cloud != 'exoscale' then { '22_prom_rule_sla_postgres': promRulePostgresSLA, [if params.slos.enabled && params.services.vshn.enabled && params.services.vshn.postgres.enabled then 'sli_exporter/70_slo_vshn_postgresql']: slos.Get('vshn-postgresql'), [if params.slos.enabled && params.services.vshn.enabled && params.services.vshn.postgres.enabled then 'sli_exporter/80_slo_vshn_postgresql_ha']: slos.Get('vshn-postgresql-ha'), diff --git a/component/vshn_redis.jsonnet b/component/vshn_redis.jsonnet index 0b6c322c8..0fdeeaf25 100644 --- a/component/vshn_redis.jsonnet +++ b/component/vshn_redis.jsonnet @@ -572,7 +572,7 @@ local plansCM = kube.ConfigMap('vshnredisplans') + { '21_composition_vshn_redis': composition, [if isOpenshift then '21_openshift_template_redis_vshn']: osTemplate, } else {}) -+ if vars.isSingleOrServiceCluster then { ++ if vars.isSingleOrServiceCluster && inv.parameters.facts.cloud != 'exoscale' then { '22_prom_rule_sla_redis': promRuleRedisSLA, [if params.services.vshn.enabled && params.services.vshn.redis.enabled then 'sli_exporter/70_slo_vshn_redis']: slos.Get('vshn-redis'), [if params.services.vshn.enabled && params.services.vshn.redis.enabled then 'sli_exporter/80_slo_vshn_redis_ha']: slos.Get('vshn-redis-ha'), diff --git a/tests/golden/vshn-cloud/appcat/appcat/22_prom_rule_sla_postgres.yaml b/tests/golden/vshn-cloud/appcat/appcat/22_prom_rule_sla_postgres.yaml deleted file mode 100644 index 47d209c86..000000000 --- a/tests/golden/vshn-cloud/appcat/appcat/22_prom_rule_sla_postgres.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - labels: - name: vshn-vshnpostgresql-sla - name: vshn-vshnpostgresql-sla - namespace: appcat-slos -spec: - groups: - - name: appcat-vshnpostgresql-sla-target - rules: - - expr: vector(99.25) - labels: - service: VSHNPostgreSQL - record: sla:objective:ratio diff --git a/tests/golden/vshn-cloud/appcat/appcat/22_prom_rule_sla_redis.yaml b/tests/golden/vshn-cloud/appcat/appcat/22_prom_rule_sla_redis.yaml deleted file mode 100644 index f70cb06b8..000000000 --- a/tests/golden/vshn-cloud/appcat/appcat/22_prom_rule_sla_redis.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - labels: - name: vshn-vshnredis-sla - name: vshn-vshnredis-sla - namespace: appcat-slos -spec: - groups: - - name: appcat-vshnredis-sla-target - rules: - - expr: vector(99.25) - labels: - service: VSHNRedis - record: sla:objective:ratio diff --git a/tests/golden/vshn-cloud/appcat/appcat/sli_exporter/70_slo_vshn_postgresql.yaml b/tests/golden/vshn-cloud/appcat/appcat/sli_exporter/70_slo_vshn_postgresql.yaml deleted file mode 100644 index e9edf19d2..000000000 --- a/tests/golden/vshn-cloud/appcat/appcat/sli_exporter/70_slo_vshn_postgresql.yaml +++ /dev/null @@ -1,193 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - labels: - name: vshn-postgresql - name: vshn-postgresql - namespace: appcat-slos -spec: - groups: - - name: sloth-slo-sli-recordings-appcat-vshn-postgresql-uptime - rules: - - expr: | - (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="false", maintenance="false"}[5m]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[5m])) by (service, namespace, name, organization, sla)) - / - (sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="false"}[5m])) by (service, namespace, name, organization, sla)) - labels: - sloth_id: appcat-vshn-postgresql-uptime - sloth_service: appcat-vshn-postgresql - sloth_slo: uptime - sloth_window: 5m - record: slo:sli_error:ratio_rate5m - - expr: | - (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="false", maintenance="false"}[30m]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[30m])) by (service, namespace, name, organization, sla)) - / - (sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="false"}[30m])) by (service, namespace, name, organization, sla)) - labels: - sloth_id: appcat-vshn-postgresql-uptime - sloth_service: appcat-vshn-postgresql - sloth_slo: uptime - sloth_window: 30m - record: slo:sli_error:ratio_rate30m - - expr: | - (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="false", maintenance="false"}[1h]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[1h])) by (service, namespace, name, organization, sla)) - / - (sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="false"}[1h])) by (service, namespace, name, organization, sla)) - labels: - sloth_id: appcat-vshn-postgresql-uptime - sloth_service: appcat-vshn-postgresql - sloth_slo: uptime - sloth_window: 1h - record: slo:sli_error:ratio_rate1h - - expr: | - (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="false", maintenance="false"}[2h]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[2h])) by (service, namespace, name, organization, sla)) - / - (sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="false"}[2h])) by (service, namespace, name, organization, sla)) - labels: - sloth_id: appcat-vshn-postgresql-uptime - sloth_service: appcat-vshn-postgresql - sloth_slo: uptime - sloth_window: 2h - record: slo:sli_error:ratio_rate2h - - expr: | - (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="false", maintenance="false"}[6h]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[6h])) by (service, namespace, name, organization, sla)) - / - (sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="false"}[6h])) by (service, namespace, name, organization, sla)) - labels: - sloth_id: appcat-vshn-postgresql-uptime - sloth_service: appcat-vshn-postgresql - sloth_slo: uptime - sloth_window: 6h - record: slo:sli_error:ratio_rate6h - - expr: | - (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="false", maintenance="false"}[1d]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[1d])) by (service, namespace, name, organization, sla)) - / - (sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="false"}[1d])) by (service, namespace, name, organization, sla)) - labels: - sloth_id: appcat-vshn-postgresql-uptime - sloth_service: appcat-vshn-postgresql - sloth_slo: uptime - sloth_window: 1d - record: slo:sli_error:ratio_rate1d - - expr: | - (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="false", maintenance="false"}[3d]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[3d])) by (service, namespace, name, organization, sla)) - / - (sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="false"}[3d])) by (service, namespace, name, organization, sla)) - labels: - sloth_id: appcat-vshn-postgresql-uptime - sloth_service: appcat-vshn-postgresql - sloth_slo: uptime - sloth_window: 3d - record: slo:sli_error:ratio_rate3d - - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-postgresql-uptime", sloth_service="appcat-vshn-postgresql", sloth_slo="uptime"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-postgresql-uptime", sloth_service="appcat-vshn-postgresql", sloth_slo="uptime"}[30d]) - labels: - sloth_id: appcat-vshn-postgresql-uptime - sloth_service: appcat-vshn-postgresql - sloth_slo: uptime - sloth_window: 30d - record: slo:sli_error:ratio_rate30d - - name: sloth-slo-meta-recordings-appcat-vshn-postgresql-uptime - rules: - - expr: vector(0.9990000000000001) - labels: - sloth_id: appcat-vshn-postgresql-uptime - sloth_service: appcat-vshn-postgresql - sloth_slo: uptime - record: slo:objective:ratio - - expr: vector(1-0.9990000000000001) - labels: - sloth_id: appcat-vshn-postgresql-uptime - sloth_service: appcat-vshn-postgresql - sloth_slo: uptime - record: slo:error_budget:ratio - - expr: vector(30) - labels: - sloth_id: appcat-vshn-postgresql-uptime - sloth_service: appcat-vshn-postgresql - sloth_slo: uptime - record: slo:time_period:days - - expr: | - slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-postgresql-uptime", sloth_service="appcat-vshn-postgresql", sloth_slo="uptime"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="appcat-vshn-postgresql-uptime", sloth_service="appcat-vshn-postgresql", sloth_slo="uptime"} - labels: - sloth_id: appcat-vshn-postgresql-uptime - sloth_service: appcat-vshn-postgresql - sloth_slo: uptime - record: slo:current_burn_rate:ratio - - expr: | - slo:sli_error:ratio_rate30d{sloth_id="appcat-vshn-postgresql-uptime", sloth_service="appcat-vshn-postgresql", sloth_slo="uptime"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="appcat-vshn-postgresql-uptime", sloth_service="appcat-vshn-postgresql", sloth_slo="uptime"} - labels: - sloth_id: appcat-vshn-postgresql-uptime - sloth_service: appcat-vshn-postgresql - sloth_slo: uptime - record: slo:period_burn_rate:ratio - - expr: 1 - slo:period_burn_rate:ratio{sloth_id="appcat-vshn-postgresql-uptime", - sloth_service="appcat-vshn-postgresql", sloth_slo="uptime"} - labels: - sloth_id: appcat-vshn-postgresql-uptime - sloth_service: appcat-vshn-postgresql - sloth_slo: uptime - record: slo:period_error_budget_remaining:ratio - - expr: vector(1) - labels: - sloth_id: appcat-vshn-postgresql-uptime - sloth_mode: cli-gen-prom - sloth_objective: '99.9' - sloth_service: appcat-vshn-postgresql - sloth_slo: uptime - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 - record: sloth_slo_info - - name: sloth-slo-alerts-appcat-vshn-postgresql-uptime - rules: - - alert: SLO_AppCat_VSHNPostgreSQLUptime - annotations: - for: 6m - summary: Probes to PostgreSQL by VSHN instance fail - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error - budget burn rate is too fast. - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-postgresql-uptime", sloth_service="appcat-vshn-postgresql", sloth_slo="uptime"} > (14.4 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="appcat-vshn-postgresql-uptime", sloth_service="appcat-vshn-postgresql", sloth_slo="uptime"} > (14.4 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="appcat-vshn-postgresql-uptime", sloth_service="appcat-vshn-postgresql", sloth_slo="uptime"} > (6 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="appcat-vshn-postgresql-uptime", sloth_service="appcat-vshn-postgresql", sloth_slo="uptime"} > (6 * 0.0009999999999999432)) without (sloth_window) - ) - for: 6m - labels: - service: VSHNPostgreSQL - severity: critical - sloth_severity: page - - alert: SLO_AppCat_VSHNPostgreSQLUptime - annotations: - runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#uptime - summary: Probes to PostgreSQL by VSHN instance fail - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error - budget burn rate is too fast. - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="appcat-vshn-postgresql-uptime", sloth_service="appcat-vshn-postgresql", sloth_slo="uptime"} > (3 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="appcat-vshn-postgresql-uptime", sloth_service="appcat-vshn-postgresql", sloth_slo="uptime"} > (3 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="appcat-vshn-postgresql-uptime", sloth_service="appcat-vshn-postgresql", sloth_slo="uptime"} > (1 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="appcat-vshn-postgresql-uptime", sloth_service="appcat-vshn-postgresql", sloth_slo="uptime"} > (1 * 0.0009999999999999432)) without (sloth_window) - ) - labels: - service: VSHNPostgreSQL - severity: warning - sloth_severity: ticket diff --git a/tests/golden/vshn-cloud/appcat/appcat/sli_exporter/70_slo_vshn_redis.yaml b/tests/golden/vshn-cloud/appcat/appcat/sli_exporter/70_slo_vshn_redis.yaml deleted file mode 100644 index 7eaeee282..000000000 --- a/tests/golden/vshn-cloud/appcat/appcat/sli_exporter/70_slo_vshn_redis.yaml +++ /dev/null @@ -1,193 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - labels: - name: vshn-redis - name: vshn-redis - namespace: appcat-slos -spec: - groups: - - name: sloth-slo-sli-recordings-appcat-vshn-redis-uptime - rules: - - expr: | - (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="false", maintenance="false"}[5m]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[5m])) by (service, namespace, name, organization, sla)) - / - (sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="false"}[5m])) by (service, namespace, name, organization, sla)) - labels: - sloth_id: appcat-vshn-redis-uptime - sloth_service: appcat-vshn-redis - sloth_slo: uptime - sloth_window: 5m - record: slo:sli_error:ratio_rate5m - - expr: | - (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="false", maintenance="false"}[30m]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[30m])) by (service, namespace, name, organization, sla)) - / - (sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="false"}[30m])) by (service, namespace, name, organization, sla)) - labels: - sloth_id: appcat-vshn-redis-uptime - sloth_service: appcat-vshn-redis - sloth_slo: uptime - sloth_window: 30m - record: slo:sli_error:ratio_rate30m - - expr: | - (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="false", maintenance="false"}[1h]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[1h])) by (service, namespace, name, organization, sla)) - / - (sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="false"}[1h])) by (service, namespace, name, organization, sla)) - labels: - sloth_id: appcat-vshn-redis-uptime - sloth_service: appcat-vshn-redis - sloth_slo: uptime - sloth_window: 1h - record: slo:sli_error:ratio_rate1h - - expr: | - (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="false", maintenance="false"}[2h]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[2h])) by (service, namespace, name, organization, sla)) - / - (sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="false"}[2h])) by (service, namespace, name, organization, sla)) - labels: - sloth_id: appcat-vshn-redis-uptime - sloth_service: appcat-vshn-redis - sloth_slo: uptime - sloth_window: 2h - record: slo:sli_error:ratio_rate2h - - expr: | - (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="false", maintenance="false"}[6h]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[6h])) by (service, namespace, name, organization, sla)) - / - (sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="false"}[6h])) by (service, namespace, name, organization, sla)) - labels: - sloth_id: appcat-vshn-redis-uptime - sloth_service: appcat-vshn-redis - sloth_slo: uptime - sloth_window: 6h - record: slo:sli_error:ratio_rate6h - - expr: | - (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="false", maintenance="false"}[1d]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[1d])) by (service, namespace, name, organization, sla)) - / - (sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="false"}[1d])) by (service, namespace, name, organization, sla)) - labels: - sloth_id: appcat-vshn-redis-uptime - sloth_service: appcat-vshn-redis - sloth_slo: uptime - sloth_window: 1d - record: slo:sli_error:ratio_rate1d - - expr: | - (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="false", maintenance="false"}[3d]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[3d])) by (service, namespace, name, organization, sla)) - / - (sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="false"}[3d])) by (service, namespace, name, organization, sla)) - labels: - sloth_id: appcat-vshn-redis-uptime - sloth_service: appcat-vshn-redis - sloth_slo: uptime - sloth_window: 3d - record: slo:sli_error:ratio_rate3d - - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-redis-uptime", sloth_service="appcat-vshn-redis", sloth_slo="uptime"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-redis-uptime", sloth_service="appcat-vshn-redis", sloth_slo="uptime"}[30d]) - labels: - sloth_id: appcat-vshn-redis-uptime - sloth_service: appcat-vshn-redis - sloth_slo: uptime - sloth_window: 30d - record: slo:sli_error:ratio_rate30d - - name: sloth-slo-meta-recordings-appcat-vshn-redis-uptime - rules: - - expr: vector(0.9990000000000001) - labels: - sloth_id: appcat-vshn-redis-uptime - sloth_service: appcat-vshn-redis - sloth_slo: uptime - record: slo:objective:ratio - - expr: vector(1-0.9990000000000001) - labels: - sloth_id: appcat-vshn-redis-uptime - sloth_service: appcat-vshn-redis - sloth_slo: uptime - record: slo:error_budget:ratio - - expr: vector(30) - labels: - sloth_id: appcat-vshn-redis-uptime - sloth_service: appcat-vshn-redis - sloth_slo: uptime - record: slo:time_period:days - - expr: | - slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-redis-uptime", sloth_service="appcat-vshn-redis", sloth_slo="uptime"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="appcat-vshn-redis-uptime", sloth_service="appcat-vshn-redis", sloth_slo="uptime"} - labels: - sloth_id: appcat-vshn-redis-uptime - sloth_service: appcat-vshn-redis - sloth_slo: uptime - record: slo:current_burn_rate:ratio - - expr: | - slo:sli_error:ratio_rate30d{sloth_id="appcat-vshn-redis-uptime", sloth_service="appcat-vshn-redis", sloth_slo="uptime"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="appcat-vshn-redis-uptime", sloth_service="appcat-vshn-redis", sloth_slo="uptime"} - labels: - sloth_id: appcat-vshn-redis-uptime - sloth_service: appcat-vshn-redis - sloth_slo: uptime - record: slo:period_burn_rate:ratio - - expr: 1 - slo:period_burn_rate:ratio{sloth_id="appcat-vshn-redis-uptime", - sloth_service="appcat-vshn-redis", sloth_slo="uptime"} - labels: - sloth_id: appcat-vshn-redis-uptime - sloth_service: appcat-vshn-redis - sloth_slo: uptime - record: slo:period_error_budget_remaining:ratio - - expr: vector(1) - labels: - sloth_id: appcat-vshn-redis-uptime - sloth_mode: cli-gen-prom - sloth_objective: '99.9' - sloth_service: appcat-vshn-redis - sloth_slo: uptime - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 - record: sloth_slo_info - - name: sloth-slo-alerts-appcat-vshn-redis-uptime - rules: - - alert: SLO_AppCat_VSHNRedisUptime - annotations: - for: 6m - summary: Probes to Redis by VSHN instance fail - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error - budget burn rate is too fast. - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-redis-uptime", sloth_service="appcat-vshn-redis", sloth_slo="uptime"} > (14.4 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="appcat-vshn-redis-uptime", sloth_service="appcat-vshn-redis", sloth_slo="uptime"} > (14.4 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="appcat-vshn-redis-uptime", sloth_service="appcat-vshn-redis", sloth_slo="uptime"} > (6 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="appcat-vshn-redis-uptime", sloth_service="appcat-vshn-redis", sloth_slo="uptime"} > (6 * 0.0009999999999999432)) without (sloth_window) - ) - for: 6m - labels: - service: VSHNRedis - severity: critical - sloth_severity: page - - alert: SLO_AppCat_VSHNRedisUptime - annotations: - runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-redis.html#uptime - summary: Probes to Redis by VSHN instance fail - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error - budget burn rate is too fast. - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="appcat-vshn-redis-uptime", sloth_service="appcat-vshn-redis", sloth_slo="uptime"} > (3 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="appcat-vshn-redis-uptime", sloth_service="appcat-vshn-redis", sloth_slo="uptime"} > (3 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="appcat-vshn-redis-uptime", sloth_service="appcat-vshn-redis", sloth_slo="uptime"} > (1 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="appcat-vshn-redis-uptime", sloth_service="appcat-vshn-redis", sloth_slo="uptime"} > (1 * 0.0009999999999999432)) without (sloth_window) - ) - labels: - service: VSHNRedis - severity: warning - sloth_severity: ticket diff --git a/tests/golden/vshn-cloud/appcat/appcat/sli_exporter/80_slo_vshn_postgresql_ha.yaml b/tests/golden/vshn-cloud/appcat/appcat/sli_exporter/80_slo_vshn_postgresql_ha.yaml deleted file mode 100644 index 9bb4eb2e6..000000000 --- a/tests/golden/vshn-cloud/appcat/appcat/sli_exporter/80_slo_vshn_postgresql_ha.yaml +++ /dev/null @@ -1,193 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - labels: - name: vshn-postgresql-ha - name: vshn-postgresql-ha - namespace: appcat-slos -spec: - groups: - - name: sloth-slo-sli-recordings-appcat-vshn-postgresql-ha-uptime - rules: - - expr: | - (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="true"}[5m]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[5m])) by (service, namespace, name, organization, sla)) - / - (sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[5m])) by (service, namespace, name, organization, sla)) - labels: - sloth_id: appcat-vshn-postgresql-ha-uptime - sloth_service: appcat-vshn-postgresql-ha - sloth_slo: uptime - sloth_window: 5m - record: slo:sli_error:ratio_rate5m - - expr: | - (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="true"}[30m]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[30m])) by (service, namespace, name, organization, sla)) - / - (sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[30m])) by (service, namespace, name, organization, sla)) - labels: - sloth_id: appcat-vshn-postgresql-ha-uptime - sloth_service: appcat-vshn-postgresql-ha - sloth_slo: uptime - sloth_window: 30m - record: slo:sli_error:ratio_rate30m - - expr: | - (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="true"}[1h]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[1h])) by (service, namespace, name, organization, sla)) - / - (sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[1h])) by (service, namespace, name, organization, sla)) - labels: - sloth_id: appcat-vshn-postgresql-ha-uptime - sloth_service: appcat-vshn-postgresql-ha - sloth_slo: uptime - sloth_window: 1h - record: slo:sli_error:ratio_rate1h - - expr: | - (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="true"}[2h]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[2h])) by (service, namespace, name, organization, sla)) - / - (sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[2h])) by (service, namespace, name, organization, sla)) - labels: - sloth_id: appcat-vshn-postgresql-ha-uptime - sloth_service: appcat-vshn-postgresql-ha - sloth_slo: uptime - sloth_window: 2h - record: slo:sli_error:ratio_rate2h - - expr: | - (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="true"}[6h]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[6h])) by (service, namespace, name, organization, sla)) - / - (sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[6h])) by (service, namespace, name, organization, sla)) - labels: - sloth_id: appcat-vshn-postgresql-ha-uptime - sloth_service: appcat-vshn-postgresql-ha - sloth_slo: uptime - sloth_window: 6h - record: slo:sli_error:ratio_rate6h - - expr: | - (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="true"}[1d]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[1d])) by (service, namespace, name, organization, sla)) - / - (sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[1d])) by (service, namespace, name, organization, sla)) - labels: - sloth_id: appcat-vshn-postgresql-ha-uptime - sloth_service: appcat-vshn-postgresql-ha - sloth_slo: uptime - sloth_window: 1d - record: slo:sli_error:ratio_rate1d - - expr: | - (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", ha="true"}[3d]) or 0*rate(appcat_probes_seconds_count{service="VSHNPostgreSQL"}[3d])) by (service, namespace, name, organization, sla)) - / - (sum(rate(appcat_probes_seconds_count{service="VSHNPostgreSQL", ha="true"}[3d])) by (service, namespace, name, organization, sla)) - labels: - sloth_id: appcat-vshn-postgresql-ha-uptime - sloth_service: appcat-vshn-postgresql-ha - sloth_slo: uptime - sloth_window: 3d - record: slo:sli_error:ratio_rate3d - - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-postgresql-ha-uptime", sloth_service="appcat-vshn-postgresql-ha", sloth_slo="uptime"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-postgresql-ha-uptime", sloth_service="appcat-vshn-postgresql-ha", sloth_slo="uptime"}[30d]) - labels: - sloth_id: appcat-vshn-postgresql-ha-uptime - sloth_service: appcat-vshn-postgresql-ha - sloth_slo: uptime - sloth_window: 30d - record: slo:sli_error:ratio_rate30d - - name: sloth-slo-meta-recordings-appcat-vshn-postgresql-ha-uptime - rules: - - expr: vector(0.9990000000000001) - labels: - sloth_id: appcat-vshn-postgresql-ha-uptime - sloth_service: appcat-vshn-postgresql-ha - sloth_slo: uptime - record: slo:objective:ratio - - expr: vector(1-0.9990000000000001) - labels: - sloth_id: appcat-vshn-postgresql-ha-uptime - sloth_service: appcat-vshn-postgresql-ha - sloth_slo: uptime - record: slo:error_budget:ratio - - expr: vector(30) - labels: - sloth_id: appcat-vshn-postgresql-ha-uptime - sloth_service: appcat-vshn-postgresql-ha - sloth_slo: uptime - record: slo:time_period:days - - expr: | - slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-postgresql-ha-uptime", sloth_service="appcat-vshn-postgresql-ha", sloth_slo="uptime"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="appcat-vshn-postgresql-ha-uptime", sloth_service="appcat-vshn-postgresql-ha", sloth_slo="uptime"} - labels: - sloth_id: appcat-vshn-postgresql-ha-uptime - sloth_service: appcat-vshn-postgresql-ha - sloth_slo: uptime - record: slo:current_burn_rate:ratio - - expr: | - slo:sli_error:ratio_rate30d{sloth_id="appcat-vshn-postgresql-ha-uptime", sloth_service="appcat-vshn-postgresql-ha", sloth_slo="uptime"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="appcat-vshn-postgresql-ha-uptime", sloth_service="appcat-vshn-postgresql-ha", sloth_slo="uptime"} - labels: - sloth_id: appcat-vshn-postgresql-ha-uptime - sloth_service: appcat-vshn-postgresql-ha - sloth_slo: uptime - record: slo:period_burn_rate:ratio - - expr: 1 - slo:period_burn_rate:ratio{sloth_id="appcat-vshn-postgresql-ha-uptime", - sloth_service="appcat-vshn-postgresql-ha", sloth_slo="uptime"} - labels: - sloth_id: appcat-vshn-postgresql-ha-uptime - sloth_service: appcat-vshn-postgresql-ha - sloth_slo: uptime - record: slo:period_error_budget_remaining:ratio - - expr: vector(1) - labels: - sloth_id: appcat-vshn-postgresql-ha-uptime - sloth_mode: cli-gen-prom - sloth_objective: '99.9' - sloth_service: appcat-vshn-postgresql-ha - sloth_slo: uptime - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 - record: sloth_slo_info - - name: sloth-slo-alerts-appcat-vshn-postgresql-ha-uptime - rules: - - alert: SLO_AppCat_HAVSHNPostgreSQLUptime - annotations: - for: 6m - summary: Probes to HA PostgreSQL by VSHN instance fail - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error - budget burn rate is too fast. - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-postgresql-ha-uptime", sloth_service="appcat-vshn-postgresql-ha", sloth_slo="uptime"} > (14.4 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="appcat-vshn-postgresql-ha-uptime", sloth_service="appcat-vshn-postgresql-ha", sloth_slo="uptime"} > (14.4 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="appcat-vshn-postgresql-ha-uptime", sloth_service="appcat-vshn-postgresql-ha", sloth_slo="uptime"} > (6 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="appcat-vshn-postgresql-ha-uptime", sloth_service="appcat-vshn-postgresql-ha", sloth_slo="uptime"} > (6 * 0.0009999999999999432)) without (sloth_window) - ) - for: 6m - labels: - service: VSHNPostgreSQL - severity: critical - sloth_severity: page - - alert: SLO_AppCat_HAVSHNPostgreSQLUptime - annotations: - runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#uptime - summary: Probes to HA PostgreSQL by VSHN instance fail - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error - budget burn rate is too fast. - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="appcat-vshn-postgresql-ha-uptime", sloth_service="appcat-vshn-postgresql-ha", sloth_slo="uptime"} > (3 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="appcat-vshn-postgresql-ha-uptime", sloth_service="appcat-vshn-postgresql-ha", sloth_slo="uptime"} > (3 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="appcat-vshn-postgresql-ha-uptime", sloth_service="appcat-vshn-postgresql-ha", sloth_slo="uptime"} > (1 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="appcat-vshn-postgresql-ha-uptime", sloth_service="appcat-vshn-postgresql-ha", sloth_slo="uptime"} > (1 * 0.0009999999999999432)) without (sloth_window) - ) - labels: - service: VSHNPostgreSQL - severity: warning - sloth_severity: ticket diff --git a/tests/golden/vshn-cloud/appcat/appcat/sli_exporter/80_slo_vshn_redis_ha.yaml b/tests/golden/vshn-cloud/appcat/appcat/sli_exporter/80_slo_vshn_redis_ha.yaml deleted file mode 100644 index cae03dbfb..000000000 --- a/tests/golden/vshn-cloud/appcat/appcat/sli_exporter/80_slo_vshn_redis_ha.yaml +++ /dev/null @@ -1,193 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - labels: - name: vshn-redis-ha - name: vshn-redis-ha - namespace: appcat-slos -spec: - groups: - - name: sloth-slo-sli-recordings-appcat-vshn-redis-ha-uptime - rules: - - expr: | - (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="true"}[5m]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[5m])) by (service, namespace, name, organization, sla)) - / - (sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="true"}[5m])) by (service, namespace, name, organization, sla)) - labels: - sloth_id: appcat-vshn-redis-ha-uptime - sloth_service: appcat-vshn-redis-ha - sloth_slo: uptime - sloth_window: 5m - record: slo:sli_error:ratio_rate5m - - expr: | - (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="true"}[30m]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[30m])) by (service, namespace, name, organization, sla)) - / - (sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="true"}[30m])) by (service, namespace, name, organization, sla)) - labels: - sloth_id: appcat-vshn-redis-ha-uptime - sloth_service: appcat-vshn-redis-ha - sloth_slo: uptime - sloth_window: 30m - record: slo:sli_error:ratio_rate30m - - expr: | - (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="true"}[1h]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[1h])) by (service, namespace, name, organization, sla)) - / - (sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="true"}[1h])) by (service, namespace, name, organization, sla)) - labels: - sloth_id: appcat-vshn-redis-ha-uptime - sloth_service: appcat-vshn-redis-ha - sloth_slo: uptime - sloth_window: 1h - record: slo:sli_error:ratio_rate1h - - expr: | - (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="true"}[2h]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[2h])) by (service, namespace, name, organization, sla)) - / - (sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="true"}[2h])) by (service, namespace, name, organization, sla)) - labels: - sloth_id: appcat-vshn-redis-ha-uptime - sloth_service: appcat-vshn-redis-ha - sloth_slo: uptime - sloth_window: 2h - record: slo:sli_error:ratio_rate2h - - expr: | - (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="true"}[6h]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[6h])) by (service, namespace, name, organization, sla)) - / - (sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="true"}[6h])) by (service, namespace, name, organization, sla)) - labels: - sloth_id: appcat-vshn-redis-ha-uptime - sloth_service: appcat-vshn-redis-ha - sloth_slo: uptime - sloth_window: 6h - record: slo:sli_error:ratio_rate6h - - expr: | - (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="true"}[1d]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[1d])) by (service, namespace, name, organization, sla)) - / - (sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="true"}[1d])) by (service, namespace, name, organization, sla)) - labels: - sloth_id: appcat-vshn-redis-ha-uptime - sloth_service: appcat-vshn-redis-ha - sloth_slo: uptime - sloth_window: 1d - record: slo:sli_error:ratio_rate1d - - expr: | - (sum(rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", ha="true"}[3d]) or 0*rate(appcat_probes_seconds_count{service="VSHNRedis"}[3d])) by (service, namespace, name, organization, sla)) - / - (sum(rate(appcat_probes_seconds_count{service="VSHNRedis", ha="true"}[3d])) by (service, namespace, name, organization, sla)) - labels: - sloth_id: appcat-vshn-redis-ha-uptime - sloth_service: appcat-vshn-redis-ha - sloth_slo: uptime - sloth_window: 3d - record: slo:sli_error:ratio_rate3d - - expr: | - sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-redis-ha-uptime", sloth_service="appcat-vshn-redis-ha", sloth_slo="uptime"}[30d]) - / ignoring (sloth_window) - count_over_time(slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-redis-ha-uptime", sloth_service="appcat-vshn-redis-ha", sloth_slo="uptime"}[30d]) - labels: - sloth_id: appcat-vshn-redis-ha-uptime - sloth_service: appcat-vshn-redis-ha - sloth_slo: uptime - sloth_window: 30d - record: slo:sli_error:ratio_rate30d - - name: sloth-slo-meta-recordings-appcat-vshn-redis-ha-uptime - rules: - - expr: vector(0.9990000000000001) - labels: - sloth_id: appcat-vshn-redis-ha-uptime - sloth_service: appcat-vshn-redis-ha - sloth_slo: uptime - record: slo:objective:ratio - - expr: vector(1-0.9990000000000001) - labels: - sloth_id: appcat-vshn-redis-ha-uptime - sloth_service: appcat-vshn-redis-ha - sloth_slo: uptime - record: slo:error_budget:ratio - - expr: vector(30) - labels: - sloth_id: appcat-vshn-redis-ha-uptime - sloth_service: appcat-vshn-redis-ha - sloth_slo: uptime - record: slo:time_period:days - - expr: | - slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-redis-ha-uptime", sloth_service="appcat-vshn-redis-ha", sloth_slo="uptime"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="appcat-vshn-redis-ha-uptime", sloth_service="appcat-vshn-redis-ha", sloth_slo="uptime"} - labels: - sloth_id: appcat-vshn-redis-ha-uptime - sloth_service: appcat-vshn-redis-ha - sloth_slo: uptime - record: slo:current_burn_rate:ratio - - expr: | - slo:sli_error:ratio_rate30d{sloth_id="appcat-vshn-redis-ha-uptime", sloth_service="appcat-vshn-redis-ha", sloth_slo="uptime"} - / on(sloth_id, sloth_slo, sloth_service) group_left - slo:error_budget:ratio{sloth_id="appcat-vshn-redis-ha-uptime", sloth_service="appcat-vshn-redis-ha", sloth_slo="uptime"} - labels: - sloth_id: appcat-vshn-redis-ha-uptime - sloth_service: appcat-vshn-redis-ha - sloth_slo: uptime - record: slo:period_burn_rate:ratio - - expr: 1 - slo:period_burn_rate:ratio{sloth_id="appcat-vshn-redis-ha-uptime", - sloth_service="appcat-vshn-redis-ha", sloth_slo="uptime"} - labels: - sloth_id: appcat-vshn-redis-ha-uptime - sloth_service: appcat-vshn-redis-ha - sloth_slo: uptime - record: slo:period_error_budget_remaining:ratio - - expr: vector(1) - labels: - sloth_id: appcat-vshn-redis-ha-uptime - sloth_mode: cli-gen-prom - sloth_objective: '99.9' - sloth_service: appcat-vshn-redis-ha - sloth_slo: uptime - sloth_spec: prometheus/v1 - sloth_version: v0.11.0 - record: sloth_slo_info - - name: sloth-slo-alerts-appcat-vshn-redis-ha-uptime - rules: - - alert: SLO_AppCat_HAVSHNRedisUptime - annotations: - for: 6m - summary: Probes to HA Redis by VSHN instance fail - title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error - budget burn rate is too fast. - expr: | - ( - max(slo:sli_error:ratio_rate5m{sloth_id="appcat-vshn-redis-ha-uptime", sloth_service="appcat-vshn-redis-ha", sloth_slo="uptime"} > (14.4 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1h{sloth_id="appcat-vshn-redis-ha-uptime", sloth_service="appcat-vshn-redis-ha", sloth_slo="uptime"} > (14.4 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate30m{sloth_id="appcat-vshn-redis-ha-uptime", sloth_service="appcat-vshn-redis-ha", sloth_slo="uptime"} > (6 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate6h{sloth_id="appcat-vshn-redis-ha-uptime", sloth_service="appcat-vshn-redis-ha", sloth_slo="uptime"} > (6 * 0.0009999999999999432)) without (sloth_window) - ) - for: 6m - labels: - service: VSHNRedis - severity: critical - sloth_severity: page - - alert: SLO_AppCat_HAVSHNRedisUptime - annotations: - runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-redis.html#uptime - summary: Probes to HA Redis by VSHN instance fail - title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error - budget burn rate is too fast. - expr: | - ( - max(slo:sli_error:ratio_rate2h{sloth_id="appcat-vshn-redis-ha-uptime", sloth_service="appcat-vshn-redis-ha", sloth_slo="uptime"} > (3 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate1d{sloth_id="appcat-vshn-redis-ha-uptime", sloth_service="appcat-vshn-redis-ha", sloth_slo="uptime"} > (3 * 0.0009999999999999432)) without (sloth_window) - ) - or - ( - max(slo:sli_error:ratio_rate6h{sloth_id="appcat-vshn-redis-ha-uptime", sloth_service="appcat-vshn-redis-ha", sloth_slo="uptime"} > (1 * 0.0009999999999999432)) without (sloth_window) - and - max(slo:sli_error:ratio_rate3d{sloth_id="appcat-vshn-redis-ha-uptime", sloth_service="appcat-vshn-redis-ha", sloth_slo="uptime"} > (1 * 0.0009999999999999432)) without (sloth_window) - ) - labels: - service: VSHNRedis - severity: warning - sloth_severity: ticket diff --git a/tests/golden/vshn-cloud/appcat/appcat/sli_exporter/90_VSHNPostgreSQL_Opsgenie.yaml b/tests/golden/vshn-cloud/appcat/appcat/sli_exporter/90_VSHNPostgreSQL_Opsgenie.yaml deleted file mode 100644 index 826bf7c73..000000000 --- a/tests/golden/vshn-cloud/appcat/appcat/sli_exporter/90_VSHNPostgreSQL_Opsgenie.yaml +++ /dev/null @@ -1,45 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - labels: - syn: 'true' - syn_component: appcat - syn_team: schedar - name: vshn-vshnpostgresql-new-sla - namespace: appcat-slos -spec: - groups: - - name: appcat-vshnpostgresql-sla-target - rules: - - alert: vshn-vshnpostgresql-new-sla - annotations: - summary: '{{$labels.service}} {{$labels.name}} down in {{$labels.namespace}}' - title: '{{$labels.service}} {{$labels.name}} down in {{$labels.namespace}}' - expr: rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", - ha="false", maintenance="false"}[5m]) > 0.2 and rate(appcat_probes_seconds_count{reason!="success", - service="VSHNPostgreSQL", ha="false", maintenance="false"}[1m]) > 0.75 - labels: - OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end - }}' - runbook: https://kb.vshn.ch/app-catalog/how-tos/appcat/GuaranteedUptimeTarget.html - service: VSHNPostgreSQL - severity: critical - syn: 'true' - syn_component: appcat - syn_team: schedar - - alert: vshn-vshnpostgresql-new-sla-ha - annotations: - summary: '{{$labels.service}} {{$labels.name}} down in {{$labels.namespace}}' - title: '{{$labels.service}} {{$labels.name}} down in {{$labels.namespace}}' - expr: rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL", - ha="true"}[5m]) > 0.2 and rate(appcat_probes_seconds_count{reason!="success", - service="VSHNPostgreSQL", ha="true"}[1m]) > 0.75 - labels: - OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end - }}' - runbook: https://kb.vshn.ch/app-catalog/how-tos/appcat/GuaranteedUptimeTarget.html - service: VSHNPostgreSQL - severity: critical - syn: 'true' - syn_component: appcat - syn_team: schedar diff --git a/tests/golden/vshn-cloud/appcat/appcat/sli_exporter/90_VSHNRedis_Opsgenie.yaml b/tests/golden/vshn-cloud/appcat/appcat/sli_exporter/90_VSHNRedis_Opsgenie.yaml deleted file mode 100644 index e5b0d0bd1..000000000 --- a/tests/golden/vshn-cloud/appcat/appcat/sli_exporter/90_VSHNRedis_Opsgenie.yaml +++ /dev/null @@ -1,45 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - labels: - syn: 'true' - syn_component: appcat - syn_team: schedar - name: vshn-vshnredis-new-sla - namespace: appcat-slos -spec: - groups: - - name: appcat-vshnredis-sla-target - rules: - - alert: vshn-vshnredis-new-sla - annotations: - summary: '{{$labels.service}} {{$labels.name}} down in {{$labels.namespace}}' - title: '{{$labels.service}} {{$labels.name}} down in {{$labels.namespace}}' - expr: rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", - ha="false", maintenance="false"}[5m]) > 0.2 and rate(appcat_probes_seconds_count{reason!="success", - service="VSHNRedis", ha="false", maintenance="false"}[1m]) > 0.75 - labels: - OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end - }}' - runbook: https://kb.vshn.ch/app-catalog/how-tos/appcat/GuaranteedUptimeTarget.html - service: VSHNRedis - severity: critical - syn: 'true' - syn_component: appcat - syn_team: schedar - - alert: vshn-vshnredis-new-sla-ha - annotations: - summary: '{{$labels.service}} {{$labels.name}} down in {{$labels.namespace}}' - title: '{{$labels.service}} {{$labels.name}} down in {{$labels.namespace}}' - expr: rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis", - ha="true"}[5m]) > 0.2 and rate(appcat_probes_seconds_count{reason!="success", - service="VSHNRedis", ha="true"}[1m]) > 0.75 - labels: - OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end - }}' - runbook: https://kb.vshn.ch/app-catalog/how-tos/appcat/GuaranteedUptimeTarget.html - service: VSHNRedis - severity: critical - syn: 'true' - syn_component: appcat - syn_team: schedar