Skip to content

Commit

Permalink
Merge pull request #487 from vshn/alerting_rework
Browse files Browse the repository at this point in the history
Sane Opsgenie alerting
  • Loading branch information
wejdross authored Sep 30, 2024
2 parents ca813ad + cde33a2 commit d08db17
Show file tree
Hide file tree
Showing 30 changed files with 387 additions and 196 deletions.
3 changes: 0 additions & 3 deletions component/slos.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ local newSLO(name, group, sloParams) =
name: name,
objective: sloParams.objective,
alerting: {
labels: params.slos.alerting.labels,
page_alert: {
labels: params.slos.alerting.page_labels,
annotations: {
Expand Down Expand Up @@ -86,7 +85,6 @@ local generateSlothInput(name, uptime) =
},
labels+: {
service: 'VSHN' + name,
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end }}',
},
},
},
Expand All @@ -104,7 +102,6 @@ local generateSlothInput(name, uptime) =
},
labels+: {
service: 'VSHN' + name,
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end }}',
},
},
},
Expand Down
63 changes: 63 additions & 0 deletions component/vshn_alerting.jsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
local kap = import 'lib/kapitan.libjsonnet';

local inv = kap.inventory();
local params = inv.parameters.appcat;


// Builds the PrometheusRule that carries the probe-failure ("SLA") alerts
// for one AppCat service.
//
// serviceName is used verbatim as the `service` matcher in the alert
// expressions and as the `service` label on the alerts; its ASCII-lowercased
// form is used in the resource, group and alert names.
//
// Returns a monitoring.coreos.com/v1 PrometheusRule object with one group
// containing two alerts: one for non-HA instances and one for HA instances.
local genGenericAlertingRule(serviceName) =
  local lowerName = std.asciiLower(serviceName);
  local ruleName = 'vshn-%s-sla' % lowerName;

  // Labels shared by the PrometheusRule metadata and by every alert.
  local synLabels = {
    syn: 'true',
    syn_team: 'schedar',
    syn_component: 'appcat',
  };

  // Per-second rate of non-successful probes for this service over `window`,
  // restricted by the additional label matchers in `selector`.
  local failedProbeRate(selector, window) =
    'rate(appcat_probes_seconds_count{reason!="success", service="%s", %s}[%s])' % [serviceName, selector, window];

  // One alert rule. It fires only when BOTH conditions hold:
  //   * the 5m failure rate is above 0.2, and
  //   * the 1m failure rate is above 0.75.
  // rate() is a per-second average, so these thresholds read as "20%" and
  // "75%" of probes failing only if each instance is probed once per second.
  // NOTE(review): confirm the probe interval of the SLI exporter.
  local slaRule(alertName, selector) = {
    alert: alertName,
    expr: '%s > 0.2 and %s > 0.75' % [
      failedProbeRate(selector, '5m'),
      failedProbeRate(selector, '1m'),
    ],
    labels: synLabels {
      // Alert-time template: "true" when the firing series has
      // sla="guaranteed", "false" otherwise.
      OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end }}',
      runbook: 'https://kb.vshn.ch/app-catalog/how-tos/appcat/GuaranteedUptimeTarget.html',
      service: serviceName,
      severity: 'critical',
    },
  };

  {
    apiVersion: 'monitoring.coreos.com/v1',
    kind: 'PrometheusRule',
    metadata: {
      name: ruleName,
      namespace: params.slos.namespace,
      labels: synLabels,
    },
    spec: {
      groups: [
        {
          name: 'appcat-%s-sla-target' % lowerName,
          rules: [
            // Non-HA instances: series with maintenance="true" are excluded.
            slaRule(ruleName, 'ha="false", maintenance="false"'),
            // HA instances: no matcher on the maintenance label.
            slaRule(ruleName + '-ha', 'ha="true"'),
          ],
        },
      ],
    },
  };


// Public interface of this file: importers call
// GenGenericAlertingRule(serviceName) to obtain the PrometheusRule object
// built by genGenericAlertingRule above.
{
GenGenericAlertingRule: genGenericAlertingRule,
}
4 changes: 4 additions & 0 deletions component/vshn_appcat_services.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ local prom = import 'prometheus.libsonnet';
local xrds = import 'xrds.libsonnet';

local slos = import 'slos.libsonnet';
local opsgenieRules = import 'vshn_alerting.jsonnet';


local inv = kap.inventory();
local params = inv.parameters.appcat;
Expand Down Expand Up @@ -188,6 +190,8 @@ local vshn_appcat_service(name, serviceParams) =
[if isOpenshift && std.objectHas(serviceParams, 'openshiftTemplate') then '21_openshift_template_%s_vshn' % name]: osTemplate,
[if params.services.vshn.enabled && serviceParams.enabled then 'sli_exporter/90_slo_vshn_%s' % name]: slos.Get('vshn-' + name),
[if params.services.vshn.enabled && serviceParams.enabled then 'sli_exporter/90_slo_vshn_%s_ha' % name]: slos.Get('vshn-' + name + '-ha'),
['sli_exporter/90_%s_Opsgenie' % name]: opsgenieRules.GenGenericAlertingRule(name),

} else {}
;

Expand Down
4 changes: 3 additions & 1 deletion component/vshn_minio.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ local common = import 'common.libsonnet';
local inv = kap.inventory();
local params = inv.parameters.appcat;
local minioParams = params.services.vshn.minio;

local opsgenieRules = import 'vshn_alerting.jsonnet';

local instances = [
kube._Object('vshn.appcat.vshn.io/v1', 'VSHNMinio', instance.name) +
Expand All @@ -29,4 +29,6 @@ local instances = [

if params.services.vshn.enabled && minioParams.enabled && std.length(instances) != 0 then {
'22_minio_instances': instances,
'sli_exporter/90_VSHNMinio_Opsgenie': opsgenieRules.GenGenericAlertingRule('VSHNMinio'),

} else {}
3 changes: 3 additions & 0 deletions component/vshn_postgres.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ local xrds = import 'xrds.libsonnet';
local inv = kap.inventory();
local params = inv.parameters.appcat;
local pgParams = params.services.vshn.postgres;
local opsgenieRules = import 'vshn_alerting.jsonnet';

local defaultDB = 'postgres';
local defaultUser = 'postgres';
Expand Down Expand Up @@ -307,4 +308,6 @@ if params.services.vshn.enabled && pgParams.enabled then
[if isOpenshift then '12_stackgres_openshift_operator_netpol']: stackgresNetworkPolicy,
[if params.slos.enabled && params.services.vshn.enabled && params.services.vshn.postgres.enabled then 'sli_exporter/90_slo_vshn_postgresql']: slos.Get('vshn-postgresql'),
[if params.slos.enabled && params.services.vshn.enabled && params.services.vshn.postgres.enabled then 'sli_exporter/90_slo_vshn_postgresql_ha']: slos.Get('vshn-postgresql-ha'),
[if params.services.vshn.enabled && params.services.vshn.postgres.enabled then 'sli_exporter/90_VSHNPostgreSQL_Opsgenie']: opsgenieRules.GenGenericAlertingRule('VSHNPostgreSQL'),

} else {}
2 changes: 2 additions & 0 deletions component/vshn_redis.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ local xrds = import 'xrds.libsonnet';
local inv = kap.inventory();
local params = inv.parameters.appcat;
local redisParams = params.services.vshn.redis;
local opsgenieRules = import 'vshn_alerting.jsonnet';

local defaultUser = 'default';
local defaultPort = '6379';
Expand Down Expand Up @@ -566,4 +567,5 @@ if params.services.vshn.enabled && redisParams.enabled then {
[if isOpenshift then '21_openshift_template_redis_vshn']: osTemplate,
[if params.services.vshn.enabled && params.services.vshn.redis.enabled then 'sli_exporter/90_slo_vshn_redis']: slos.Get('vshn-redis'),
[if params.services.vshn.enabled && params.services.vshn.redis.enabled then 'sli_exporter/90_slo_vshn_redis_ha']: slos.Get('vshn-redis-ha'),
[if params.services.vshn.enabled && params.services.vshn.redis.enabled then 'sli_exporter/90_VSHNRedis_Opsgenie']: opsgenieRules.GenGenericAlertingRule('VSHNRedis'),
} else {}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
syn: 'true'
syn_component: appcat
syn_team: schedar
name: vshn-minio-sla
namespace: appcat-slos
spec:
groups:
- name: appcat-minio-sla-target
rules:
- alert: vshn-minio-sla
expr: rate(appcat_probes_seconds_count{reason!="success", service="minio",
ha="false", maintenance="false"}[5m]) > 0.2 and rate(appcat_probes_seconds_count{reason!="success",
service="minio", ha="false", maintenance="false"}[1m]) > 0.75
labels:
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end
}}'
runbook: https://kb.vshn.ch/app-catalog/how-tos/appcat/GuaranteedUptimeTarget.html
service: minio
severity: critical
syn: 'true'
syn_component: appcat
syn_team: schedar
- alert: vshn-minio-sla-ha
expr: rate(appcat_probes_seconds_count{reason!="success", service="minio",
ha="true"}[5m]) > 0.2 and rate(appcat_probes_seconds_count{reason!="success",
service="minio", ha="true"}[1m]) > 0.75
labels:
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end
}}'
runbook: https://kb.vshn.ch/app-catalog/how-tos/appcat/GuaranteedUptimeTarget.html
service: minio
severity: critical
syn: 'true'
syn_component: appcat
syn_team: schedar
Original file line number Diff line number Diff line change
Expand Up @@ -166,15 +166,9 @@ spec:
)
for: 6m
labels:
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end
}}'
service: VSHNMinio
severity: critical
slo: 'true'
sloth_severity: page
syn: 'true'
syn_component: appcat
syn_team: schedar
- alert: SLO_AppCat_VSHNMinioUptime
annotations:
runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-minio.html#uptime
Expand All @@ -194,12 +188,6 @@ spec:
max(slo:sli_error:ratio_rate3d{sloth_id="appcat-vshn-minio-uptime", sloth_service="appcat-vshn-minio", sloth_slo="uptime"} > (1 * 0.0009999999999999432)) without (sloth_window)
)
labels:
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end
}}'
service: VSHNMinio
severity: warning
slo: 'true'
sloth_severity: ticket
syn: 'true'
syn_component: appcat
syn_team: schedar
Original file line number Diff line number Diff line change
Expand Up @@ -166,15 +166,9 @@ spec:
)
for: 6m
labels:
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end
}}'
service: VSHNMinio
severity: critical
slo: 'true'
sloth_severity: page
syn: 'true'
syn_component: appcat
syn_team: schedar
- alert: SLO_AppCat_HAVSHNMinioUptime
annotations:
runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-minio.html#uptime
Expand All @@ -194,12 +188,6 @@ spec:
max(slo:sli_error:ratio_rate3d{sloth_id="appcat-vshn-minio-ha-uptime", sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"} > (1 * 0.0009999999999999432)) without (sloth_window)
)
labels:
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end
}}'
service: VSHNMinio
severity: warning
slo: 'true'
sloth_severity: ticket
syn: 'true'
syn_component: appcat
syn_team: schedar
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
syn: 'true'
syn_component: appcat
syn_team: schedar
name: vshn-vshnpostgresql-sla
namespace: appcat-slos
spec:
groups:
- name: appcat-vshnpostgresql-sla-target
rules:
- alert: vshn-vshnpostgresql-sla
expr: rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL",
ha="false", maintenance="false"}[5m]) > 0.2 and rate(appcat_probes_seconds_count{reason!="success",
service="VSHNPostgreSQL", ha="false", maintenance="false"}[1m]) > 0.75
labels:
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end
}}'
runbook: https://kb.vshn.ch/app-catalog/how-tos/appcat/GuaranteedUptimeTarget.html
service: VSHNPostgreSQL
severity: critical
syn: 'true'
syn_component: appcat
syn_team: schedar
- alert: vshn-vshnpostgresql-sla-ha
expr: rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL",
ha="true"}[5m]) > 0.2 and rate(appcat_probes_seconds_count{reason!="success",
service="VSHNPostgreSQL", ha="true"}[1m]) > 0.75
labels:
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end
}}'
runbook: https://kb.vshn.ch/app-catalog/how-tos/appcat/GuaranteedUptimeTarget.html
service: VSHNPostgreSQL
severity: critical
syn: 'true'
syn_component: appcat
syn_team: schedar
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
syn: 'true'
syn_component: appcat
syn_team: schedar
name: vshn-vshnredis-sla
namespace: appcat-slos
spec:
groups:
- name: appcat-vshnredis-sla-target
rules:
- alert: vshn-vshnredis-sla
expr: rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis",
ha="false", maintenance="false"}[5m]) > 0.2 and rate(appcat_probes_seconds_count{reason!="success",
service="VSHNRedis", ha="false", maintenance="false"}[1m]) > 0.75
labels:
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end
}}'
runbook: https://kb.vshn.ch/app-catalog/how-tos/appcat/GuaranteedUptimeTarget.html
service: VSHNRedis
severity: critical
syn: 'true'
syn_component: appcat
syn_team: schedar
- alert: vshn-vshnredis-sla-ha
expr: rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis",
ha="true"}[5m]) > 0.2 and rate(appcat_probes_seconds_count{reason!="success",
service="VSHNRedis", ha="true"}[1m]) > 0.75
labels:
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end
}}'
runbook: https://kb.vshn.ch/app-catalog/how-tos/appcat/GuaranteedUptimeTarget.html
service: VSHNRedis
severity: critical
syn: 'true'
syn_component: appcat
syn_team: schedar
Original file line number Diff line number Diff line change
Expand Up @@ -166,15 +166,9 @@ spec:
)
for: 6m
labels:
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end
}}'
service: VSHNPostgreSQL
severity: critical
slo: 'true'
sloth_severity: page
syn: 'true'
syn_component: appcat
syn_team: schedar
- alert: SLO_AppCat_VSHNPostgreSQLUptime
annotations:
runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#uptime
Expand All @@ -194,12 +188,6 @@ spec:
max(slo:sli_error:ratio_rate3d{sloth_id="appcat-vshn-postgresql-uptime", sloth_service="appcat-vshn-postgresql", sloth_slo="uptime"} > (1 * 0.0009999999999999432)) without (sloth_window)
)
labels:
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end
}}'
service: VSHNPostgreSQL
severity: warning
slo: 'true'
sloth_severity: ticket
syn: 'true'
syn_component: appcat
syn_team: schedar
Original file line number Diff line number Diff line change
Expand Up @@ -166,15 +166,9 @@ spec:
)
for: 6m
labels:
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end
}}'
service: VSHNPostgreSQL
severity: critical
slo: 'true'
sloth_severity: page
syn: 'true'
syn_component: appcat
syn_team: schedar
- alert: SLO_AppCat_HAVSHNPostgreSQLUptime
annotations:
runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#uptime
Expand All @@ -194,12 +188,6 @@ spec:
max(slo:sli_error:ratio_rate3d{sloth_id="appcat-vshn-postgresql-ha-uptime", sloth_service="appcat-vshn-postgresql-ha", sloth_slo="uptime"} > (1 * 0.0009999999999999432)) without (sloth_window)
)
labels:
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end
}}'
service: VSHNPostgreSQL
severity: warning
slo: 'true'
sloth_severity: ticket
syn: 'true'
syn_component: appcat
syn_team: schedar
Loading

0 comments on commit d08db17

Please sign in to comment.