Merge pull request #175 from vshn/pg/add_max_connections_alert
Add max connection alert
Kidswiss authored Jun 5, 2024
2 parents d00e7d3 + e1b4bda commit b3b9c6b
Showing 4 changed files with 47 additions and 19 deletions.
30 changes: 15 additions & 15 deletions pkg/comp-functions/functions/common/nonsla/alerting.go
@@ -22,9 +22,9 @@ type Alerts struct {
}

const (
- synTeam string = "schedar"
- severityCritical string = "critical"
- minuteInterval, hourInterval, twoHourInterval promV1.Duration = "1m", "1h", "2h"
+ SynTeam string = "schedar"
+ SeverityCritical string = "critical"
+ MinuteInterval, HourInterval, TwoHourInterval promV1.Duration = "1m", "1h", "2h"
)

var (
@@ -33,7 +33,7 @@ var (

pvFillUp: func(name, namespace string) promV1.Rule {
return promV1.Rule{
- Alert: name + "PersistentVolumeFillingUp",
+ Alert: "PersistentVolumeFillingUp",
Annotations: map[string]string{
"description": "The volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is only {{ $value | humanizePercentage }} free.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup",
@@ -43,16 +43,16 @@
Type: intstr.String,
StrVal: "label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job=\"kubelet\", metrics_path=\"/metrics\"} / kubelet_volume_stats_capacity_bytes{job=\"kubelet\",metrics_path=\"/metrics\"}) < 0.03 and kubelet_volume_stats_used_bytes{job=\"kubelet\",metrics_path=\"/metrics\"} > 0 unless on(namespace,persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{access_mode=\"ReadOnlyMany\"} == 1 unless on(namespace,persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts=\"true\"}== 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace)kube_namespace_labels, \"name\", \"$1\", \"namespace\",\"vshn-" + namespace + "-(.+)-.+\")",
},
- For: minuteInterval,
+ For: MinuteInterval,
Labels: map[string]string{
"severity": severityCritical,
"syn_team": synTeam,
"severity": SeverityCritical,
"syn_team": SynTeam,
},
}
},
pvExpectedFillUp: func(name, namespace string) promV1.Rule {
return promV1.Rule{
- Alert: name + "PersistentVolumeExpectedToFillUp",
+ Alert: "PersistentVolumeExpectedToFillUp",
Annotations: map[string]string{
"description": "Based on recent sampling, the volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.",
"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup",
@@ -62,16 +62,16 @@ var (
Type: intstr.String,
StrVal: "label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job=\"kubelet\",metrics_path=\"/metrics\"} / kubelet_volume_stats_capacity_bytes{job=\"kubelet\",metrics_path=\"/metrics\"}) < 0.15 and kubelet_volume_stats_used_bytes{job=\"kubelet\",metrics_path=\"/metrics\"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job=\"kubelet\",metrics_path=\"/metrics\"}[6h], 4 * 24 * 3600) < 0 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{access_mode=\"ReadOnlyMany\"} == 1 unless on(namespace,persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts=\"true\"}== 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace)kube_namespace_labels, \"name\", \"$1\", \"namespace\",\"vshn-" + namespace + "-(.+)-.+\")",
},
- For: hourInterval,
+ For: HourInterval,
Labels: map[string]string{
"severity": severityCritical,
"syn_team": synTeam,
"severity": SeverityCritical,
"syn_team": SynTeam,
},
}
},
memCritical: func(name, namespace string) promV1.Rule {
return promV1.Rule{
- Alert: name + "MemoryCritical",
+ Alert: "MemoryCritical",
Annotations: map[string]string{
"description": "The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 85% for 2 hours.\n Please reduce the load of this instance, or increase the memory.",
"runbook_url": "https://hub.syn.tools/appcat/runbooks/vshn-generic.html#MemoryCritical",
@@ -81,10 +81,10 @@ var (
Type: intstr.String,
StrVal: "label_replace( topk(1, (max(container_memory_working_set_bytes{container=\"" + name + "\"})without (name, id) / on(container,pod,namespace) kube_pod_container_resource_limits{resource=\"memory\"}* 100) > 85) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace)kube_namespace_labels, \"name\", \"$1\", \"namespace\",\"vshn-" + namespace + "-(.+)-.+\")",
},
- For: twoHourInterval,
+ For: TwoHourInterval,
Labels: map[string]string{
"severity": severityCritical,
"syn_team": synTeam,
"severity": SeverityCritical,
"syn_team": SynTeam,
},
}
},
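The previously package-private identifiers are now exported (SynTeam, SeverityCritical and the interval durations), so service-specific packages can label their own rules with the same conventions, and the service name prefix is dropped from the alert names, giving every service the fixed names PersistentVolumeFillingUp, PersistentVolumeExpectedToFillUp and MemoryCritical. A minimal sketch of what the export enables (illustrative only, not part of this commit):

    package example

    import (
        promV1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
        "github.com/vshn/appcat/v4/pkg/comp-functions/functions/common/nonsla"
    )

    // Shared severity/team labels, now reusable from outside the nonsla package.
    var exampleLabels = map[string]string{
        "severity": nonsla.SeverityCritical, // "critical"
        "syn_team": nonsla.SynTeam,          // "schedar"
    }

    // The shared evaluation intervals are promV1.Duration values ("1m", "1h", "2h").
    var exampleFor promV1.Duration = nonsla.TwoHourInterval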
6 changes: 3 additions & 3 deletions pkg/comp-functions/functions/common/nonsla/alerting_test.go
@@ -48,15 +48,15 @@ func TestNewAlertSetBuilder(t *testing.T) {

checkCount := 0
for _, rule := range rules {
- if rule.Alert == containerName+"PersistentVolumeExpectedToFillUp" {
+ if rule.Alert == "PersistentVolumeExpectedToFillUp" {
assert.Equal(t, patroniPersistentVolumeExpectedToFillUp, rule.Expr.StrVal)
checkCount++
}
- if rule.Alert == containerName+"MemoryCritical" {
+ if rule.Alert == "MemoryCritical" {
assert.Equal(t, patroniMemoryCritical, rule.Expr.StrVal)
checkCount++
}
- if rule.Alert == containerName+"PersistentVolumeFillingUp" {
+ if rule.Alert == "PersistentVolumeFillingUp" {
assert.Equal(t, patroniPersistentVolumeFillingUp, rule.Expr.StrVal)
checkCount++
}
25 changes: 25 additions & 0 deletions pkg/comp-functions/functions/vshnpostgres/alerts.go
@@ -0,0 +1,25 @@
package vshnpostgres

import (
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"github.com/vshn/appcat/v4/pkg/comp-functions/functions/common/nonsla"
"k8s.io/apimachinery/pkg/util/intstr"
)

var maxConnectionsAlert = promv1.Rule{
Alert: "PostgreSQLConnectionsCritical",
Annotations: map[string]string{
"description": "The connections to {{ $labels.pod }} have been over 90% of the configured connections for 2 hours.\n Please reduce the load of this instance.",
"runbook_url": "https://kb.vshn.ch/app-catalog/how-tos/appcat/vshn/postgres/PostgreSQLConnectionsCritical.html",
"summary": "Connection usage critical",
},
Expr: intstr.IntOrString{
Type: intstr.String,
StrVal: "sum(pg_stat_activity_count) by (pod)\n > 90/100 * sum(pg_settings_max_connections) by (pod)",
},
For: nonsla.TwoHourInterval,
Labels: map[string]string{
"severity": nonsla.SeverityCritical,
"syn_team": nonsla.SynTeam,
},
}
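The expression compares the number of active connections per pod against 90% of that pod's configured max_connections, and the rule only fires after the condition has held for nonsla.TwoHourInterval; for example, with pg_settings_max_connections at 100, the alert fires once pg_stat_activity_count stays above 90 for two hours. A unit test for the new rule could look roughly like this (a sketch in the style of alerting_test.go; such a test is not part of this commit):

    package vshnpostgres

    import (
        "testing"

        "github.com/stretchr/testify/assert"
        "github.com/vshn/appcat/v4/pkg/comp-functions/functions/common/nonsla"
    )

    func TestMaxConnectionsAlert(t *testing.T) {
        // The rule carries the fixed name, the shared labels and the two-hour hold time.
        assert.Equal(t, "PostgreSQLConnectionsCritical", maxConnectionsAlert.Alert)
        assert.Equal(t, nonsla.TwoHourInterval, maxConnectionsAlert.For)
        assert.Equal(t, nonsla.SeverityCritical, maxConnectionsAlert.Labels["severity"])
        assert.Equal(t, nonsla.SynTeam, maxConnectionsAlert.Labels["syn_team"])
        // The PromQL compares per-pod connection counts against the per-pod limit.
        assert.Contains(t, maxConnectionsAlert.Expr.StrVal, "pg_settings_max_connections")
    }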
5 changes: 4 additions & 1 deletion pkg/comp-functions/functions/vshnpostgres/register.go
@@ -1,12 +1,15 @@
package vshnpostgres

import (
+ promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
vshnv1 "github.com/vshn/appcat/v4/apis/vshn/v1"
"github.com/vshn/appcat/v4/pkg/comp-functions/functions/common"
"github.com/vshn/appcat/v4/pkg/comp-functions/functions/common/nonsla"
"github.com/vshn/appcat/v4/pkg/comp-functions/runtime"
)

+ var pgAlerts = nonsla.NewAlertSetBuilder("patroni", "postgresql").AddAll().AddCustom([]promv1.Rule{maxConnectionsAlert}).GetAlerts()

func init() {
runtime.RegisterService("postgresql", runtime.Service{
Steps: []runtime.Step{
@@ -64,7 +67,7 @@ func init() {
},
{
Name: "non-sla-prometheus-rules",
- Execute: nonsla.GenerateNonSLAPromRules(&vshnv1.VSHNPostgreSQL{}, nonsla.NewAlertSetBuilder("patroni", "postgresql").AddAll().GetAlerts()),
+ Execute: nonsla.GenerateNonSLAPromRules(&vshnv1.VSHNPostgreSQL{}, pgAlerts),
},
{
Name: "pgbouncer-settings",
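The generic alert set (container name "patroni", "postgresql" namespace pattern) and the PostgreSQL-specific connection rule are now composed once into pgAlerts via the builder and passed to GenerateNonSLAPromRules. Other services could extend their alert sets the same way; a hedged sketch (the "redis" names and the custom rule are hypothetical, only the builder calls mirror the ones above):

    package vshnredis

    import (
        promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
        "github.com/vshn/appcat/v4/pkg/comp-functions/functions/common/nonsla"
    )

    // Placeholder for a service-specific rule, analogous to maxConnectionsAlert.
    var myCustomRedisRule promv1.Rule

    // Generic non-SLA alerts plus the custom rule, built the same way as pgAlerts.
    var redisAlerts = nonsla.NewAlertSetBuilder("redis", "redis").
        AddAll().
        AddCustom([]promv1.Rule{myCustomRedisRule}).
        GetAlerts()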
