Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add env to queries, improve titles, fix queries #5

Merged
merged 2 commits on Sep 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 14 additions & 14 deletions aws/alb/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@ locals {
monitor_warn_default_priority = null
monitor_nodata_default_priority = null

title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "
title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}"
title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"
}

resource "datadog_monitor" "http_5xx_responses" {
count = var.http_5xx_responses_enabled ? 1 : 0

name = join("", [local.title_prefix, "ALB 5xx Responses - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "ALB 5xx Responses - {{loadbalancer.name}}", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
Expand All @@ -27,8 +27,8 @@ resource "datadog_monitor" "http_5xx_responses" {

query = <<END
min(${var.http_5xx_responses_evaluation_window}):
default(avg:aws.applicationelb.httpcode_elb_5xx${local.query_filter} by {loadbalancer,region,aws_account}.as_rate(), 0) / (
default(avg:aws.applicationelb.request_count${local.query_filter} by {loadbalancer,region,aws_account}.as_rate(), 1)
default(avg:aws.applicationelb.httpcode_elb_5xx${local.query_filter} by {aws_account,env,loadbalancer,region}.as_rate(), 0) / (
default(avg:aws.applicationelb.request_count${local.query_filter} by {aws_account,env,loadbalancer,region}.as_rate(), 1)
) * 100 > ${var.http_5xx_responses_threshold_critical}
END

Expand All @@ -41,7 +41,7 @@ END
resource "datadog_monitor" "http_5xx_tg_responses" {
count = var.http_5xx_tg_responses_enabled ? 1 : 0

name = join("", [local.title_prefix, "ALB Target Group 5xx Responses - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "ALB Target Group 5xx Responses - {{loadbalancer.name}}", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
Expand All @@ -57,8 +57,8 @@ resource "datadog_monitor" "http_5xx_tg_responses" {

query = <<END
min(${var.http_5xx_tg_responses_evaluation_window}):
default(avg:aws.applicationelb.httpcode_elb_5xx${local.query_filter} by {loadbalancer,region,aws_account,targetgroup}.as_rate(), 0) / (
default(avg:aws.applicationelb.request_count${local.query_filter} by {loadbalancer,region,aws_account,targetgroup}.as_rate(), 1)
default(avg:aws.applicationelb.httpcode_elb_5xx${local.query_filter} by {loadbalancer,region,aws_account,targetgroup,env}.as_rate(), 0) / (
default(avg:aws.applicationelb.request_count${local.query_filter} by {loadbalancer,region,aws_account,targetgroup,env}.as_rate(), 1)
) * 100 > ${var.http_5xx_tg_responses_threshold_critical}
END

Expand All @@ -72,7 +72,7 @@ END
resource "datadog_monitor" "latency" {
count = var.latency_enabled ? 1 : 0

name = join("", [local.title_prefix, "ALB latency - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "{{loadbalancer.name}} ALB latency - {{value}}s ", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
Expand All @@ -88,7 +88,7 @@ resource "datadog_monitor" "latency" {

query = <<END
avg(${var.latency_evaluation_window}):
default(avg:aws.applicationelb.target_response_time.average${local.query_filter} by {loadbalancer,region,aws_account}, 0
default(avg:aws.applicationelb.target_response_time.average${local.query_filter} by {aws_account,env,loadbalancer,region}, 0
) > ${var.latency_threshold_critical}
END

Expand All @@ -101,7 +101,7 @@ END
resource "datadog_monitor" "no_healthy_instances" {
count = var.no_healthy_instances_enabled ? 1 : 0

name = join("", [local.title_prefix, "ALB healthy instances - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "{{loadbalancer.name}} ALB healthy instances is at {{value}}%", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
Expand All @@ -117,10 +117,10 @@ resource "datadog_monitor" "no_healthy_instances" {

query = <<END
min(${var.no_healthy_instances_evaluation_window}): (
sum:aws.applicationelb.healthy_host_count.minimum${local.query_filter} by {loadbalancer,region,aws_account} / (
sum:aws.applicationelb.healthy_host_count.minimum${local.query_filter} by {loadbalancer,region,aws_account} +
sum:aws.applicationelb.un_healthy_host_count.maximum${local.query_filter} by {loadbalancer,region,aws_account} )
) <= ${var.no_healthy_instances_threshold_critical}
sum:aws.applicationelb.healthy_host_count.minimum${local.query_filter} by {aws_account,env,region,loadbalancer} / (
sum:aws.applicationelb.healthy_host_count.minimum${local.query_filter} by {aws_account,env,region,loadbalancer} +
sum:aws.applicationelb.un_healthy_host_count.maximum${local.query_filter} by {aws_account,env,region,loadbalancer} )
) * 100 <= ${var.no_healthy_instances_threshold_critical}
END

monitor_thresholds {
Expand Down
2 changes: 1 addition & 1 deletion aws/apigateway/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ locals {
monitor_warn_default_priority = null
monitor_nodata_default_priority = null

title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "
title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}"
title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"
}

Expand Down
2 changes: 1 addition & 1 deletion aws/beanstalk/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ locals {

latency_metric = local.latency_metric_map[var.latency_measurement]

title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "
title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}"
title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"
}

Expand Down
18 changes: 9 additions & 9 deletions aws/ec2/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@ locals {
monitor_warn_default_priority = null
monitor_nodata_default_priority = null

title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "
title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}"
title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"
}

resource "datadog_monitor" "status_failed_check" {
count = var.status_failed_check_enabled ? 1 : 0

name = join("", [local.title_prefix, "EC2 instance status - status check failure - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "EC2 instance status - status check failure - {{name.name}}({{instance_id.name}})", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
Expand All @@ -26,7 +26,7 @@ resource "datadog_monitor" "status_failed_check" {

query = <<END
max(${var.status_failed_check_evaluation_window}):
max:aws.ec2.status_check_failed${local.query_filter} by {instance_id,region,aws_account}
max:aws.ec2.status_check_failed${local.query_filter} by {aws_account,env,instance_id,name,region}
>= 1
END

Expand All @@ -38,7 +38,7 @@ END
resource "datadog_monitor" "status_failed_instance" {
count = var.status_failed_instance_enabled ? 1 : 0

name = join("", [local.title_prefix, "EC2 instance status - instance failure - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "EC2 instance status - instance failure - {{name.name}}({{instance_id.name}})", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
Expand All @@ -53,7 +53,7 @@ resource "datadog_monitor" "status_failed_instance" {

query = <<END
max(${var.status_failed_instance_evaluation_window}):
max:aws.ec2.status_check_failed_instance${local.query_filter} by {instance_id,region,aws_account}
max:aws.ec2.status_check_failed_instance${local.query_filter} by {aws_account,env,instance_id,name,region}
>= 1
END

Expand All @@ -65,7 +65,7 @@ END
resource "datadog_monitor" "status_failed_system" {
count = var.status_failed_system_enabled ? 1 : 0

name = join("", [local.title_prefix, "EC2 instance status - host failure - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "EC2 instance status - host failure - {{name.name}}({{instance_id.name}})", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
Expand All @@ -80,7 +80,7 @@ resource "datadog_monitor" "status_failed_system" {

query = <<END
max(${var.status_failed_system_evaluation_window}):
max:aws.ec2.status_check_failed_system${local.query_filter} by {instance_id,region,aws_account}
max:aws.ec2.status_check_failed_system${local.query_filter} by {aws_account,env,instance_id,name,region}
>= 1
END

Expand All @@ -92,7 +92,7 @@ END
resource "datadog_monitor" "status_failed_volume" {
count = var.status_failed_volume_enabled ? 1 : 0

name = join("", [local.title_prefix, "EC2 instance status - volume failure - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "EC2 instance status - volume failure - {{name.name}}({{instance_id.name}})", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
Expand All @@ -107,7 +107,7 @@ resource "datadog_monitor" "status_failed_volume" {

query = <<END
max(${var.status_failed_volume_evaluation_window}):
max:aws.ec2.status_check_failed_attached_ebs${local.query_filter} by {instance_id,region,aws_account}
max:aws.ec2.status_check_failed_attached_ebs${local.query_filter} by {aws_account,env,instance_id,name,region}
>= 1
END

Expand Down
26 changes: 12 additions & 14 deletions aws/ecs-cluster/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,18 @@ locals {
monitor_warn_default_priority = null
monitor_nodata_default_priority = null

title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "
title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}"
title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"
}

resource "datadog_monitor" "agent_status" {
count = var.agent_status_enabled ? 1 : 0

name = join("", [local.title_prefix, "ECS Cluster Agent Status - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "ECS Agent disconnected - {{clustername.name}}", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
type = "query alert"
type = "service check"

evaluation_delay = var.evaluation_delay
new_group_delay = var.new_group_delay
Expand All @@ -26,11 +26,9 @@ resource "datadog_monitor" "agent_status" {
require_full_window = true
timeout_h = var.timeout_h

query = <<END
min(${var.agent_status_evaluation_window}):
aws.ecs.agent_connected${local.service_filter}.by("cluster", "instance_id").last(6).count_by_status()
>= ${var.agent_status_threshold_critical}
END
query = <<EOQ
"aws.ecs.agent_connected"${local.service_filter}.by("clustername","instance_id").last(6).count_by_status()
EOQ

monitor_thresholds {
critical = var.agent_status_threshold_critical
Expand All @@ -41,7 +39,7 @@ END
resource "datadog_monitor" "cpu_utilization" {
count = var.cpu_utilization_enabled ? 1 : 0

name = join("", [local.title_prefix, "ECS Cluster CPU Utilization - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "ECS Cluster CPU Utilization - {{clustername.name}} - {{value}}%", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
Expand All @@ -57,7 +55,7 @@ resource "datadog_monitor" "cpu_utilization" {

query = <<END
min(${var.cpu_utilization_evaluation_window}):
avg:aws.ecs.cluster.cpuutilization${local.query_filter} by {clustername,region,aws_account}
avg:aws.ecs.cluster.cpuutilization${local.query_filter} by {clustername,region,aws_account,env}
> ${var.cpu_utilization_threshold_critical}
END

Expand All @@ -70,7 +68,7 @@ END
resource "datadog_monitor" "cpu_utilization_anomaly" {
count = var.cpu_utilization_anomaly_enabled ? 1 : 0

name = join("", [local.title_prefix, "ECS cluster CPU utilization anomalous activity - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "ECS cluster CPU utilization anomalous activity - {{clustername.name}}", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
Expand All @@ -86,7 +84,7 @@ resource "datadog_monitor" "cpu_utilization_anomaly" {

query = <<END
avg(${var.cpu_utilization_anomaly_evaluation_window}):anomalies(
avg:aws.ecs.cluster.cpuutilization${local.query_filter} by {clustername,region,aws_account}, 'agile', ${var.cpu_utilization_anomaly_deviations},
avg:aws.ecs.cluster.cpuutilization${local.query_filter} by {clustername,region,aws_account,env}, 'agile', ${var.cpu_utilization_anomaly_deviations},
direction='above', count_default_zero='true', interval=${var.cpu_utilization_anomaly_rollup},
seasonality='${var.cpu_utilization_anomaly_seasonality}'
) >= ${var.cpu_utilization_anomaly_threshold_critical}
Expand All @@ -106,7 +104,7 @@ END
resource "datadog_monitor" "memory_reservation" {
count = var.memory_reservation_enabled ? 1 : 0

name = join("", [local.title_prefix, "ECS Cluster CPU Reservation - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "ECS Cluster Memory Reservation High - {{clustername.name}} - {{value}}%", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
Expand All @@ -122,7 +120,7 @@ resource "datadog_monitor" "memory_reservation" {

query = <<END
min(${var.memory_reservation_evaluation_window}):
avg:aws.ecs.cluster.memory_reservation${local.query_filter} by {clustername,region,aws_account}
avg:aws.ecs.cluster.memory_reservation${local.query_filter} by {clustername,region,aws_account,env}
> ${var.memory_reservation_threshold_critical}
END

Expand Down
18 changes: 9 additions & 9 deletions aws/ecs-fargate/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@ locals {
monitor_warn_default_priority = null
monitor_nodata_default_priority = null

title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "
title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}"
title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"
}

resource "datadog_monitor" "fargate_check" {
count = var.fargate_check_enabled ? 1 : 0

name = join("", [local.title_prefix, "ECS Fargate task status check - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "Fargate service not responding", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
Expand Down Expand Up @@ -40,7 +40,7 @@ END
resource "datadog_monitor" "cpu_utilization" {
count = var.cpu_utilization_enabled ? 1 : 0

name = join("", [local.title_prefix, "ECS Fargate task CPU utilization - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "ECS Fargate task CPU utilization", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
Expand All @@ -56,7 +56,7 @@ resource "datadog_monitor" "cpu_utilization" {

query = <<END
avg(${var.cpu_utilization_evaluation_window}):
avg:ecs.fargate.cpu.percent${local.query_filter} by {ecs_container_name,task_family,region,aws_account}
avg:ecs.fargate.cpu.percent${local.query_filter} by {ecs_container_name,task_family,region,aws_account,env}
> ${var.cpu_utilization_threshold_critical}
END

Expand All @@ -69,7 +69,7 @@ END
resource "datadog_monitor" "cpu_utilization_anomaly" {
count = var.cpu_utilization_anomaly_enabled ? 1 : 0

name = join("", [local.title_prefix, "ECS service CPU utilization anomalous activity - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "ECS service CPU utilization anomalous activity", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
Expand All @@ -85,7 +85,7 @@ resource "datadog_monitor" "cpu_utilization_anomaly" {

query = <<END
avg(${var.cpu_utilization_anomaly_evaluation_window}):anomalies(
avg:ecs.fargate.cpu.percent${local.query_filter} by {servicename,region,aws_account}, 'agile', ${var.cpu_utilization_anomaly_deviations},
avg:ecs.fargate.cpu.percent${local.query_filter} by {servicename,region,aws_account,env}, 'agile', ${var.cpu_utilization_anomaly_deviations},
direction='above', count_default_zero='true', interval=${var.cpu_utilization_anomaly_rollup},
seasonality='${var.cpu_utilization_anomaly_seasonality}'
) >= ${var.cpu_utilization_anomaly_threshold_critical}
Expand All @@ -105,7 +105,7 @@ END
resource "datadog_monitor" "memory_utilization" {
count = var.memory_utilization_enabled ? 1 : 0

name = join("", [local.title_prefix, "ECS Fargate task memory utilization - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "ECS Fargate task memory utilization", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
Expand All @@ -121,8 +121,8 @@ resource "datadog_monitor" "memory_utilization" {

query = <<END
avg(${var.memory_utilization_evaluation_window}):(
avg:ecs.fargate.mem.usage${local.query_filter} by {ecs_container_name,task_family,region,aws_account} /
avg:ecs.fargate.mem.limit${local.query_filter} by {ecs_container_name,task_family,region,aws_account}
avg:ecs.fargate.mem.usage${local.query_filter} by {ecs_container_name,task_family,region,aws_account,env} /
avg:ecs.fargate.mem.limit${local.query_filter} by {ecs_container_name,task_family,region,aws_account,env}
) >= ${var.memory_utilization_threshold_critical}
END

Expand Down
Loading
Loading