From e5a4adb1c03ba1680a8c0af10bc85548b3acfdab Mon Sep 17 00:00:00 2001 From: Kenny Leung Date: Thu, 19 Dec 2024 13:19:44 -0800 Subject: [PATCH] alerts to catch 4xx type errors Signed-off-by: Kenny Leung --- modules/alerting/README.md | 4 ++ modules/alerting/main.tf | 101 ++++++++++++++++++++++++++++++++++ modules/alerting/variables.tf | 12 ++++ 3 files changed, 117 insertions(+) diff --git a/modules/alerting/README.md b/modules/alerting/README.md index 7afa89b8..7a016832 100644 --- a/modules/alerting/README.md +++ b/modules/alerting/README.md @@ -28,6 +28,8 @@ No modules. | [google_monitoring_alert_policy.cloud-run-scaling-failure](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource | | [google_monitoring_alert_policy.cloudrun_timeout](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource | | [google_monitoring_alert_policy.fatal](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource | +| [google_monitoring_alert_policy.grpc_error_rate](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource | +| [google_monitoring_alert_policy.http_error_rate](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource | | [google_monitoring_alert_policy.oom](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource | | [google_monitoring_alert_policy.panic](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource | | [google_monitoring_alert_policy.panic-stacktrace](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource | @@ -48,6 +50,8 @@ No modules. | [failure\_rate\_exclude\_services](#input\_failure\_rate\_exclude\_services) | List of service names to exclude from the 5xx failure rate alert | `list(string)` | `[]` | no | | [failure\_rate\_ratio\_threshold](#input\_failure\_rate\_ratio\_threshold) | ratio threshold to alert for cloud run server failure rate. | `number` | `0.2` | no | | [global\_only\_alerts](#input\_global\_only\_alerts) | only enable global alerts. when true, only create alerts that are global. | `bool` | `false` | no | +| [grpc\_error\_threshold](#input\_grpc\_error\_threshold) | threshold for grpc error. | `number` | `0.25` | no | +| [http\_error\_threshold](#input\_http\_error\_threshold) | threshold for http error. | `number` | `0.25` | no | | [notification\_channels](#input\_notification\_channels) | List of notification channels to alert. | `list(string)` | `[]` | no | | [notification\_channels\_email](#input\_notification\_channels\_email) | Email notification channel. | `list(string)` | `[]` | no | | [notification\_channels\_pagerduty](#input\_notification\_channels\_pagerduty) | Email notification channel. | `list(string)` | `[]` | no | diff --git a/modules/alerting/main.tf b/modules/alerting/main.tf index 052dd5dc..f0cfdaeb 100644 --- a/modules/alerting/main.tf +++ b/modules/alerting/main.tf @@ -15,6 +15,7 @@ locals { locals { squad_log_filter = var.squad == "" ? "" : "labels.squad=\"${var.squad}\"" name = var.squad == "" ? "global" : var.squad + metric_filter = var.squad == "" ? "" : "metric.labels.team=\"${var.squad}\"" } locals { @@ -916,3 +917,103 @@ resource "google_monitoring_alert_policy" "pinned" { enabled = "true" project = var.project_id } + +resource "google_monitoring_alert_policy" "http_error_rate" { + count = var.global_only_alerts ? 0 : 1 + + alert_strategy { + auto_close = "3600s" // 1 hour + } + + combiner = "OR" + + conditions { + condition_threshold { + aggregations { + alignment_period = "60s" + cross_series_reducer = "REDUCE_MEAN" + per_series_aligner = "ALIGN_RATE" + group_by_fields = [ + "metric.label.team", + "metric.label.service_name", + ] + } + + comparison = "COMPARISON_GT" + duration = "300s" + # ignore registry service - valid 4xx use cases + # ignore prober - handled by prober alerts + # ignore 2xx and 3xx, only care 4xx and 5xx + filter = <