From ca3ed5b9e11f64e8489838c8952f1271dae717bc Mon Sep 17 00:00:00 2001 From: Lalo Galvan <106835113+lalo-galvan@users.noreply.github.com> Date: Thu, 6 Jun 2024 13:48:57 -0600 Subject: [PATCH 1/6] add new generic host monitors --- common/common.tf | 2 + host/agent/.terraform.lock.hcl | 44 +++++++++ host/agent/README.md | 0 host/agent/common.tf | 1 + host/agent/main.tf | 35 ++++++++ host/agent/variables.tf | 23 +++++ host/agent/versions.tf | 1 + host/cpu/.terraform.lock.hcl | 44 +++++++++ host/cpu/README.md | 0 host/cpu/common.tf | 1 + host/cpu/main.tf | 37 ++++++++ host/cpu/variables.tf | 53 +++++++++++ host/cpu/versions.tf | 1 + host/disk/.terraform.lock.hcl | 44 +++++++++ host/disk/README.md | 0 host/disk/common.tf | 1 + host/disk/main.tf | 100 +++++++++++++++++++++ host/disk/variables.tf | 149 +++++++++++++++++++++++++++++++ host/disk/versions.tf | 1 + host/memory/.terraform.lock.hcl | 44 +++++++++ host/memory/README.md | 0 host/memory/common.tf | 1 + host/memory/main.tf | 39 ++++++++ host/memory/variables.tf | 47 ++++++++++ host/memory/versions.tf | 1 + host/process/.terraform.lock.hcl | 44 +++++++++ host/process/README.md | 0 host/process/common.tf | 1 + host/process/main.tf | 37 ++++++++ host/process/variables.tf | 53 +++++++++++ host/process/versions.tf | 1 + host/swap/.terraform.lock.hcl | 44 +++++++++ host/swap/README.md | 0 host/swap/common.tf | 1 + host/swap/main.tf | 38 ++++++++ host/swap/variables.tf | 47 ++++++++++ host/swap/versions.tf | 1 + 37 files changed, 936 insertions(+) create mode 100644 host/agent/.terraform.lock.hcl create mode 100644 host/agent/README.md create mode 120000 host/agent/common.tf create mode 100644 host/agent/main.tf create mode 100644 host/agent/variables.tf create mode 120000 host/agent/versions.tf create mode 100644 host/cpu/.terraform.lock.hcl create mode 100644 host/cpu/README.md create mode 120000 host/cpu/common.tf create mode 100644 host/cpu/main.tf create mode 100644 host/cpu/variables.tf create mode 120000 
host/cpu/versions.tf create mode 100644 host/disk/.terraform.lock.hcl create mode 100644 host/disk/README.md create mode 120000 host/disk/common.tf create mode 100644 host/disk/main.tf create mode 100644 host/disk/variables.tf create mode 120000 host/disk/versions.tf create mode 100644 host/memory/.terraform.lock.hcl create mode 100644 host/memory/README.md create mode 120000 host/memory/common.tf create mode 100644 host/memory/main.tf create mode 100644 host/memory/variables.tf create mode 120000 host/memory/versions.tf create mode 100644 host/process/.terraform.lock.hcl create mode 100644 host/process/README.md create mode 120000 host/process/common.tf create mode 100644 host/process/main.tf create mode 100644 host/process/variables.tf create mode 120000 host/process/versions.tf create mode 100644 host/swap/.terraform.lock.hcl create mode 100644 host/swap/README.md create mode 120000 host/swap/common.tf create mode 100644 host/swap/main.tf create mode 100644 host/swap/variables.tf create mode 120000 host/swap/versions.tf diff --git a/common/common.tf b/common/common.tf index dab8e0b..a770728 100644 --- a/common/common.tf +++ b/common/common.tf @@ -1,3 +1,5 @@ +# tflint-ignore-file: terraform_unused_declarations + ######################################## # Tag Related Vars ######################################## diff --git a/host/agent/.terraform.lock.hcl b/host/agent/.terraform.lock.hcl new file mode 100644 index 0000000..0791549 --- /dev/null +++ b/host/agent/.terraform.lock.hcl @@ -0,0 +1,44 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. 
+ +provider "registry.terraform.io/datadog/datadog" { + version = "3.39.0" + constraints = ">= 3.37.0" + hashes = [ + "h1:6rSVMI3RBWRHA70wH8eIGyzH1IZHu1hwdixT+UbD9S0=", + "zh:3389298e5116232edd1c6e64c7a83f696d5663f1a1c7da5bb32be3472faa4a9e", + "zh:39b5d91e58372716d9c7d4869e8391612d3799a936f7c94cc9b3efbe08f4a6d9", + "zh:4a077105269fd187fc751a390d3e894092a2dd1b44e7e76ff9923968d971dc6a", + "zh:4ddfc1d5839eb4b88bdf3c620bc7f3a94c850d5e5a21b8c818bba63c4f264d24", + "zh:65d076f03bbaa2782c7487411bda925c14ee4841aefd0eda5442c5e4e73856cd", + "zh:7e357a7fe969c7ccbcb755799b95612f9e0bd835cfd993b05b5303c6dde1ffbd", + "zh:a1f11d390762eddaeef701c9d2747c8a9e113afa7fe7284210d62a21d1113f6e", + "zh:a7a17b04e66d7b13dca6240dab393a3dffec4429fc685fdeabd7113064cd8331", + "zh:af60a0062adcf35d89b159a45f1ce77ad8ea196bccaea1d1e17ab8caffbe859a", + "zh:bf8e5836cb7cbc82fae10b874d137fd5e804c7308ce05c76eaebee6e5b26123e", + "zh:c1c4742ea3be506124a494bdc9d2f4ec777e98e0ad6add9b6e81d4703dc4ac1e", + "zh:d121a6fcb947c674026b35751f10ffe4941ab3c163d4c0f210d053e0984f11e6", + "zh:d99b1c20a343c4cf7c4a2117e9b29eb8056d9f481c52d19500c037c8b6e87878", + "zh:e1338c712edb4a87b7ebeb9bd4f7ae72dc3b7574fdac45bc8e2d0cef784e0597", + ] +} + +provider "registry.terraform.io/hashicorp/null" { + version = "3.2.2" + constraints = ">= 3.1.0" + hashes = [ + "h1:IMVAUHKoydFrlPrl9OzasDnw/8ntZFerCC9iXw1rXQY=", + "zh:3248aae6a2198f3ec8394218d05bd5e42be59f43a3a7c0b71c66ec0df08b69e7", + "zh:32b1aaa1c3013d33c245493f4a65465eab9436b454d250102729321a44c8ab9a", + "zh:38eff7e470acb48f66380a73a5c7cdd76cc9b9c9ba9a7249c7991488abe22fe3", + "zh:4c2f1faee67af104f5f9e711c4574ff4d298afaa8a420680b0cb55d7bbc65606", + "zh:544b33b757c0b954dbb87db83a5ad921edd61f02f1dc86c6186a5ea86465b546", + "zh:696cf785090e1e8cf1587499516b0494f47413b43cb99877ad97f5d0de3dc539", + "zh:6e301f34757b5d265ae44467d95306d61bef5e41930be1365f5a8dcf80f59452", + "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", + 
"zh:913a929070c819e59e94bb37a2a253c228f83921136ff4a7aa1a178c7cce5422", + "zh:aa9015926cd152425dbf86d1abdbc74bfe0e1ba3d26b3db35051d7b9ca9f72ae", + "zh:bb04798b016e1e1d49bcc76d62c53b56c88c63d6f2dfe38821afef17c416a0e1", + "zh:c23084e1b23577de22603cff752e59128d83cfecc2e6819edadd8cf7a10af11e", + ] +} diff --git a/host/agent/README.md b/host/agent/README.md new file mode 100644 index 0000000..e69de29 diff --git a/host/agent/common.tf b/host/agent/common.tf new file mode 120000 index 0000000..47c0063 --- /dev/null +++ b/host/agent/common.tf @@ -0,0 +1 @@ +../../common/common.tf \ No newline at end of file diff --git a/host/agent/main.tf b/host/agent/main.tf new file mode 100644 index 0000000..0ee09ae --- /dev/null +++ b/host/agent/main.tf @@ -0,0 +1,35 @@ +locals { + # these must be defined but do not need to be overridden + monitor_alert_default_priority = null + monitor_warn_default_priority = null + monitor_nodata_default_priority = null + + title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] " + title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})" +} + +resource "datadog_monitor" "host_unreachable" { + count = var.host_unreachable_enabled ? 
1 : 0 + + name = join("", [local.title_prefix, "Host Unreachable - {{host.name}}", local.title_suffix]) + message = local.query_alert_base_message + tags = concat(local.common_tags, var.base_tags, var.additional_tags) + type = "service check" + + evaluation_delay = var.evaluation_delay + new_group_delay = var.new_group_delay + notify_no_data = false + renotify_interval = var.renotify_interval + require_full_window = true + timeout_h = var.timeout_h + + query = < ${var.cpu_utilization_threshold_critical} + EOQ + + monitor_thresholds { + warning = var.cpu_utilization_threshold_warning + critical = var.cpu_utilization_threshold_critical + } +} diff --git a/host/cpu/variables.tf b/host/cpu/variables.tf new file mode 100644 index 0000000..76fcbd9 --- /dev/null +++ b/host/cpu/variables.tf @@ -0,0 +1,53 @@ +######################################## +# Global variables +######################################## +variable "additional_tags" { + default = [] + description = "Additional tags (key:value format) to add to this type of check (combined with `local.tags` and `var.base_tags`)" + type = list(string) +} + +variable "base_tags" { + default = ["resource:apigateway"] + description = "Base tags (key:value format) to add to this type of check (combined with `local.tags` and `var.additional_tags`, generally you should not change this)" + type = list(string) +} + +######################################## +# CPU Utilization +######################################## +variable "cpu_utilization_enabled" { + default = true + description = "Flag to enable CPU Utilization monitor" + type = bool +} + +variable "cpu_utilization_no_data_window" { + default = 10 + description = "No data threshold (in minutes, 0 to disable)" + type = number +} + +variable "cpu_utilization_time_aggregator" { + description = "Monitor aggregator for CPU high [available values: min, max or avg]" + type = string + default = "min" +} + +variable "cpu_utilization_timeframe" { + description = "Monitor timeframe 
for CPU high [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = string + default = "last_1h" +} + +variable "cpu_utilization_threshold_critical" { + default = 90 + description = "Critical threshold (percent)" + type = number +} + +variable "cpu_utilization_threshold_warning" { + default = 80 + description = "Warning threshold (percent)" + type = number +} diff --git a/host/cpu/versions.tf b/host/cpu/versions.tf new file mode 120000 index 0000000..cbeda73 --- /dev/null +++ b/host/cpu/versions.tf @@ -0,0 +1 @@ +../../common/versions.tf \ No newline at end of file diff --git a/host/disk/.terraform.lock.hcl b/host/disk/.terraform.lock.hcl new file mode 100644 index 0000000..0791549 --- /dev/null +++ b/host/disk/.terraform.lock.hcl @@ -0,0 +1,44 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. + +provider "registry.terraform.io/datadog/datadog" { + version = "3.39.0" + constraints = ">= 3.37.0" + hashes = [ + "h1:6rSVMI3RBWRHA70wH8eIGyzH1IZHu1hwdixT+UbD9S0=", + "zh:3389298e5116232edd1c6e64c7a83f696d5663f1a1c7da5bb32be3472faa4a9e", + "zh:39b5d91e58372716d9c7d4869e8391612d3799a936f7c94cc9b3efbe08f4a6d9", + "zh:4a077105269fd187fc751a390d3e894092a2dd1b44e7e76ff9923968d971dc6a", + "zh:4ddfc1d5839eb4b88bdf3c620bc7f3a94c850d5e5a21b8c818bba63c4f264d24", + "zh:65d076f03bbaa2782c7487411bda925c14ee4841aefd0eda5442c5e4e73856cd", + "zh:7e357a7fe969c7ccbcb755799b95612f9e0bd835cfd993b05b5303c6dde1ffbd", + "zh:a1f11d390762eddaeef701c9d2747c8a9e113afa7fe7284210d62a21d1113f6e", + "zh:a7a17b04e66d7b13dca6240dab393a3dffec4429fc685fdeabd7113064cd8331", + "zh:af60a0062adcf35d89b159a45f1ce77ad8ea196bccaea1d1e17ab8caffbe859a", + "zh:bf8e5836cb7cbc82fae10b874d137fd5e804c7308ce05c76eaebee6e5b26123e", + "zh:c1c4742ea3be506124a494bdc9d2f4ec777e98e0ad6add9b6e81d4703dc4ac1e", + "zh:d121a6fcb947c674026b35751f10ffe4941ab3c163d4c0f210d053e0984f11e6", + 
"zh:d99b1c20a343c4cf7c4a2117e9b29eb8056d9f481c52d19500c037c8b6e87878", + "zh:e1338c712edb4a87b7ebeb9bd4f7ae72dc3b7574fdac45bc8e2d0cef784e0597", + ] +} + +provider "registry.terraform.io/hashicorp/null" { + version = "3.2.2" + constraints = ">= 3.1.0" + hashes = [ + "h1:IMVAUHKoydFrlPrl9OzasDnw/8ntZFerCC9iXw1rXQY=", + "zh:3248aae6a2198f3ec8394218d05bd5e42be59f43a3a7c0b71c66ec0df08b69e7", + "zh:32b1aaa1c3013d33c245493f4a65465eab9436b454d250102729321a44c8ab9a", + "zh:38eff7e470acb48f66380a73a5c7cdd76cc9b9c9ba9a7249c7991488abe22fe3", + "zh:4c2f1faee67af104f5f9e711c4574ff4d298afaa8a420680b0cb55d7bbc65606", + "zh:544b33b757c0b954dbb87db83a5ad921edd61f02f1dc86c6186a5ea86465b546", + "zh:696cf785090e1e8cf1587499516b0494f47413b43cb99877ad97f5d0de3dc539", + "zh:6e301f34757b5d265ae44467d95306d61bef5e41930be1365f5a8dcf80f59452", + "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", + "zh:913a929070c819e59e94bb37a2a253c228f83921136ff4a7aa1a178c7cce5422", + "zh:aa9015926cd152425dbf86d1abdbc74bfe0e1ba3d26b3db35051d7b9ca9f72ae", + "zh:bb04798b016e1e1d49bcc76d62c53b56c88c63d6f2dfe38821afef17c416a0e1", + "zh:c23084e1b23577de22603cff752e59128d83cfecc2e6819edadd8cf7a10af11e", + ] +} diff --git a/host/disk/README.md b/host/disk/README.md new file mode 100644 index 0000000..e69de29 diff --git a/host/disk/common.tf b/host/disk/common.tf new file mode 120000 index 0000000..47c0063 --- /dev/null +++ b/host/disk/common.tf @@ -0,0 +1 @@ +../../common/common.tf \ No newline at end of file diff --git a/host/disk/main.tf b/host/disk/main.tf new file mode 100644 index 0000000..ca4ce0f --- /dev/null +++ b/host/disk/main.tf @@ -0,0 +1,100 @@ +locals { + # these must be defined but do not need to be overridden + monitor_alert_default_priority = null + monitor_warn_default_priority = null + monitor_nodata_default_priority = null + + title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] " + title_suffix = var.title_suffix == null ? 
"" : " (${var.title_suffix})" +} + +resource "datadog_monitor" "disk_space" { + count = var.disk_space_enabled ? 1 : 0 + + name = join("", [local.title_prefix, "Disk Space - {{host.name}}", local.title_suffix]) + message = local.query_alert_base_message + tags = concat(local.common_tags, var.base_tags, var.additional_tags) + type = "query alert" + + evaluation_delay = var.evaluation_delay + new_group_delay = var.new_group_delay + notify_no_data = false + notify_audit = false + timeout_h = var.timeout_h + include_tags = true + require_full_window = true + + query = < ${var.disk_space_threshold_critical} + EOQ + + monitor_thresholds { + warning = var.disk_space_threshold_warning + critical = var.disk_space_threshold_critical + } +} + +resource "datadog_monitor" "disk_space_forecast" { + count = var.disk_space_forecast_enabled ? 1 : 0 + + name = join("", [local.title_prefix, "Disk Space Forecast - {{host.name}}", local.title_suffix]) + message = local.query_alert_base_message + tags = concat(local.common_tags, var.base_tags, var.additional_tags) + type = "query alert" + + evaluation_delay = var.evaluation_delay + new_group_delay = var.new_group_delay + notify_audit = false + timeout_h = var.timeout_h + include_tags = true + require_full_window = true + notify_no_data = false + renotify_interval = 0 + + query = <= ${var.disk_space_forecast_threshold_critical} + EOQ + + monitor_thresholds { + critical_recovery = var.disk_space_forecast_threshold_critical_recovery + critical = var.disk_space_forecast_threshold_critical + } +} + +resource "datadog_monitor" "disk_inodes" { + count = var.disk_inodes_enabled ? 
1 : 0 + + name = join("", [local.title_prefix, "Disk Inodes Usage - {{host.name}}", local.title_suffix]) + message = local.query_alert_base_message + tags = concat(local.common_tags, var.base_tags, var.additional_tags) + type = "query alert" + + query = < ${var.disk_inodes_threshold_critical} + EOQ + + evaluation_delay = var.evaluation_delay + new_group_delay = var.new_group_delay + notify_no_data = false + notify_audit = false + timeout_h = var.timeout_h + include_tags = true + require_full_window = true + + monitor_thresholds { + warning = var.disk_inodes_threshold_warning + critical = var.disk_inodes_threshold_critical + } +} diff --git a/host/disk/variables.tf b/host/disk/variables.tf new file mode 100644 index 0000000..1cd809d --- /dev/null +++ b/host/disk/variables.tf @@ -0,0 +1,149 @@ +######################################## +# Global variables +######################################## +variable "additional_tags" { + default = [] + description = "Additional tags (key:value format) to add to this type of check (combined with `local.tags` and `var.base_tags`)" + type = list(string) +} + +variable "base_tags" { + default = ["resource:apigateway"] + description = "Base tags (key:value format) to add to this type of check (combined with `local.tags` and `var.additional_tags`, generally you should not change this)" + type = list(string) +} + +######################################## +# Disk Space +######################################## +variable "disk_space_enabled" { + description = "Flag to enable Free diskspace monitor" + type = bool + default = true +} + +variable "disk_space_time_aggregator" { + description = "Monitor aggregator for Free diskspace [available values: min, max or avg]" + type = string + default = "max" +} + +variable "disk_space_timeframe" { + description = "Monitor timeframe for Free diskspace [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = string + default = "last_5m" +} + +variable 
"disk_space_threshold_warning" { + description = "Free disk space warning threshold" + type = number + default = 80 +} + +variable "disk_space_threshold_critical" { + description = "Free disk space critical threshold" + type = number + default = 90 +} + +######################################## +# Disk Space Forecast +######################################## +variable "disk_space_forecast_enabled" { + description = "Flag to enable Free diskspace forecast monitor" + type = bool + default = true +} + +variable "disk_space_forecast_time_aggregator" { + description = "Monitor aggregator for Free diskspace forecast [available values: min, max or avg]" + type = string + default = "max" +} + +variable "disk_space_forecast_timeframe" { + description = "Monitor timeframe for Free diskspace forecast [available values: `next_12h`, `next_#d` (1, 2, or 3), `next_#w` (1 or 2) or `next_#mo` (1, 2 or 3)]" + type = string + default = "next_1w" +} + +variable "disk_space_forecast_algorithm" { + description = "Algorithm for the Free diskspace Forecast monitor [available values: `linear` or `seasonal`]" + type = string + default = "linear" +} + +variable "disk_space_forecast_deviations" { + description = "Deviations for the Free diskspace Forecast monitor [available values: `1`, `2`, `3`, `4` or `5`]" + type = string + default = "1" +} + +variable "disk_space_forecast_interval" { + description = "Interval for the Free diskspace Forecast monitor [available values: `30m`, `60m` or `120m`]" + type = string + default = "60m" +} + +variable "disk_space_forecast_linear_history" { + description = "History for the Free diskspace Forecast monitor [available values: `12h`, `#d` (1, 2, or 3), `#w` (1, or 2) or `#mo` (1, 2 or 3)]" + type = string + default = "1w" +} + +variable "disk_space_forecast_linear_model" { + description = "Model for the Free diskspace Forecast monitor [available values: `default`, `simple` or `reactive`]" + type = string + default = "default" +} + +variable 
"disk_space_forecast_seasonal_seasonality" { + description = "Seasonality for the Free diskspace Forecast monitor" + type = string + default = "weekly" +} + +variable "disk_space_forecast_threshold_critical_recovery" { + description = "Free disk space forecast recovery threshold" + type = number + default = 72 +} + +variable "disk_space_forecast_threshold_critical" { + description = "Free disk space forecast critical threshold" + type = number + default = 80 +} + +######################################## +# Disk Inodes +######################################## +variable "disk_inodes_enabled" { + description = "Flag to enable Free disk inodes monitor" + type = bool + default = true +} + +variable "disk_inodes_time_aggregator" { + description = "Monitor aggregator for Free disk inodes [available values: min, max or avg]" + type = string + default = "min" +} + +variable "disk_inodes_timeframe" { + description = "Monitor timeframe for Free disk inodes [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = string + default = "last_5m" +} + +variable "disk_inodes_threshold_warning" { + description = "Free disk inodes warning threshold" + type = number + default = 90 +} + +variable "disk_inodes_threshold_critical" { + description = "Free disk inodes critical threshold" + type = number + default = 95 +} diff --git a/host/disk/versions.tf b/host/disk/versions.tf new file mode 120000 index 0000000..cbeda73 --- /dev/null +++ b/host/disk/versions.tf @@ -0,0 +1 @@ +../../common/versions.tf \ No newline at end of file diff --git a/host/memory/.terraform.lock.hcl b/host/memory/.terraform.lock.hcl new file mode 100644 index 0000000..0791549 --- /dev/null +++ b/host/memory/.terraform.lock.hcl @@ -0,0 +1,44 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. 
+ +provider "registry.terraform.io/datadog/datadog" { + version = "3.39.0" + constraints = ">= 3.37.0" + hashes = [ + "h1:6rSVMI3RBWRHA70wH8eIGyzH1IZHu1hwdixT+UbD9S0=", + "zh:3389298e5116232edd1c6e64c7a83f696d5663f1a1c7da5bb32be3472faa4a9e", + "zh:39b5d91e58372716d9c7d4869e8391612d3799a936f7c94cc9b3efbe08f4a6d9", + "zh:4a077105269fd187fc751a390d3e894092a2dd1b44e7e76ff9923968d971dc6a", + "zh:4ddfc1d5839eb4b88bdf3c620bc7f3a94c850d5e5a21b8c818bba63c4f264d24", + "zh:65d076f03bbaa2782c7487411bda925c14ee4841aefd0eda5442c5e4e73856cd", + "zh:7e357a7fe969c7ccbcb755799b95612f9e0bd835cfd993b05b5303c6dde1ffbd", + "zh:a1f11d390762eddaeef701c9d2747c8a9e113afa7fe7284210d62a21d1113f6e", + "zh:a7a17b04e66d7b13dca6240dab393a3dffec4429fc685fdeabd7113064cd8331", + "zh:af60a0062adcf35d89b159a45f1ce77ad8ea196bccaea1d1e17ab8caffbe859a", + "zh:bf8e5836cb7cbc82fae10b874d137fd5e804c7308ce05c76eaebee6e5b26123e", + "zh:c1c4742ea3be506124a494bdc9d2f4ec777e98e0ad6add9b6e81d4703dc4ac1e", + "zh:d121a6fcb947c674026b35751f10ffe4941ab3c163d4c0f210d053e0984f11e6", + "zh:d99b1c20a343c4cf7c4a2117e9b29eb8056d9f481c52d19500c037c8b6e87878", + "zh:e1338c712edb4a87b7ebeb9bd4f7ae72dc3b7574fdac45bc8e2d0cef784e0597", + ] +} + +provider "registry.terraform.io/hashicorp/null" { + version = "3.2.2" + constraints = ">= 3.1.0" + hashes = [ + "h1:IMVAUHKoydFrlPrl9OzasDnw/8ntZFerCC9iXw1rXQY=", + "zh:3248aae6a2198f3ec8394218d05bd5e42be59f43a3a7c0b71c66ec0df08b69e7", + "zh:32b1aaa1c3013d33c245493f4a65465eab9436b454d250102729321a44c8ab9a", + "zh:38eff7e470acb48f66380a73a5c7cdd76cc9b9c9ba9a7249c7991488abe22fe3", + "zh:4c2f1faee67af104f5f9e711c4574ff4d298afaa8a420680b0cb55d7bbc65606", + "zh:544b33b757c0b954dbb87db83a5ad921edd61f02f1dc86c6186a5ea86465b546", + "zh:696cf785090e1e8cf1587499516b0494f47413b43cb99877ad97f5d0de3dc539", + "zh:6e301f34757b5d265ae44467d95306d61bef5e41930be1365f5a8dcf80f59452", + "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", + 
"zh:913a929070c819e59e94bb37a2a253c228f83921136ff4a7aa1a178c7cce5422", + "zh:aa9015926cd152425dbf86d1abdbc74bfe0e1ba3d26b3db35051d7b9ca9f72ae", + "zh:bb04798b016e1e1d49bcc76d62c53b56c88c63d6f2dfe38821afef17c416a0e1", + "zh:c23084e1b23577de22603cff752e59128d83cfecc2e6819edadd8cf7a10af11e", + ] +} diff --git a/host/memory/README.md b/host/memory/README.md new file mode 100644 index 0000000..e69de29 diff --git a/host/memory/common.tf b/host/memory/common.tf new file mode 120000 index 0000000..47c0063 --- /dev/null +++ b/host/memory/common.tf @@ -0,0 +1 @@ +../../common/common.tf \ No newline at end of file diff --git a/host/memory/main.tf b/host/memory/main.tf new file mode 100644 index 0000000..129bd45 --- /dev/null +++ b/host/memory/main.tf @@ -0,0 +1,39 @@ +locals { + # these must be defined but do not need to be overridden + monitor_alert_default_priority = null + monitor_warn_default_priority = null + monitor_nodata_default_priority = null + + title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] " + title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})" +} + +resource "datadog_monitor" "memory" { + count = var.memory_enabled ? 
1 : 0 + + name = join("", [local.title_prefix, "Usable Memory - {{host.name}}", local.title_suffix]) + message = local.query_alert_base_message + tags = concat(local.common_tags, var.base_tags, var.additional_tags) + type = "query alert" + + query = < ${var.swap_threshold_critical} + EOQ + + evaluation_delay = var.evaluation_delay + new_group_delay = var.new_group_delay + notify_no_data = false + renotify_interval = 0 + notify_audit = false + timeout_h = var.timeout_h + include_tags = true + require_full_window = true + + monitor_thresholds { + warning = var.swap_threshold_warning + critical = var.swap_threshold_critical + } +} diff --git a/host/swap/variables.tf b/host/swap/variables.tf new file mode 100644 index 0000000..16e22b7 --- /dev/null +++ b/host/swap/variables.tf @@ -0,0 +1,47 @@ +######################################## +# Global variables +######################################## +variable "additional_tags" { + default = [] + description = "Additional tags (key:value format) to add to this type of check (combined with `local.tags` and `var.base_tags`)" + type = list(string) +} + +variable "base_tags" { + default = ["resource:apigateway"] + description = "Base tags (key:value format) to add to this type of check (combined with `local.tags` and `var.additional_tags`, generally you should not change this)" + type = list(string) +} + +######################################## +# Swap +######################################## +variable "swap_enabled" { + description = "Flag to enable Swap monitor" + type = bool + default = true +} + +variable "swap_time_aggregator" { + description = "Monitor aggregator for Free Swap [available values: min, max or avg]" + type = string + default = "max" +} + +variable "swap_timeframe" { + description = "Monitor timeframe for Free Swap [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = string + default = "last_5m" +} + +variable "swap_threshold_warning" { + description = 
"Free Swap warning threshold as percentage" + type = number + default = 0.3 +} + +variable "swap_threshold_critical" { + description = "Free Swap critical threshold as percentage" + type = number + default = 0.1 +} diff --git a/host/swap/versions.tf b/host/swap/versions.tf new file mode 120000 index 0000000..cbeda73 --- /dev/null +++ b/host/swap/versions.tf @@ -0,0 +1 @@ +../../common/versions.tf \ No newline at end of file From b301686eb741bc18f2b338f58b1042d32dc25d6a Mon Sep 17 00:00:00 2001 From: Lalo Galvan <106835113+lalo-galvan@users.noreply.github.com> Date: Thu, 6 Jun 2024 14:11:44 -0600 Subject: [PATCH 2/6] update tf docs --- host/agent/README.md | 65 +++++++++++++++++++++++++++++++ host/cpu/README.md | 70 +++++++++++++++++++++++++++++++++ host/disk/README.md | 87 ++++++++++++++++++++++++++++++++++++++++++ host/memory/README.md | 69 +++++++++++++++++++++++++++++++++ host/process/README.md | 70 +++++++++++++++++++++++++++++++++ host/swap/README.md | 69 +++++++++++++++++++++++++++++++++ 6 files changed, 430 insertions(+) diff --git a/host/agent/README.md b/host/agent/README.md index e69de29..b71562a 100644 --- a/host/agent/README.md +++ b/host/agent/README.md @@ -0,0 +1,65 @@ +# terraform-datadog-monitor/host/agent + +Configures monitor for Unreachable Host. + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | ~> 1.5 | +| [datadog](#requirement\_datadog) | >= 3.37 | +| [null](#requirement\_null) | >= 3.1.0 | + +## Providers + +| Name | Version | +|------|---------| +| [datadog](#provider\_datadog) | 3.39.0 | + +## Modules + +No modules. 
+ +## Resources + +| Name | Type | +|------|------| +| [datadog_monitor.host_unreachable](https://registry.terraform.io/providers/datadog/datadog/latest/docs/resources/monitor) | resource | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [additional\_tags](#input\_additional\_tags) | Additional tags (key:value format) to add to this type of check (combined with `local.tags` and `var.base_tags`) | `list(string)` | `[]` | no | +| [alert\_critical\_priority](#input\_alert\_critical\_priority) | Priority for alerts within critical threshold (P1-P5, uses monitor defaults if not specified) | `string` | `null` | no | +| [alert\_message](#input\_alert\_message) | Message to prepend to alert notifications | `string` | `"Alert"` | no | +| [alert\_nodata\_priority](#input\_alert\_nodata\_priority) | Priority for alerts with no data (P1-P5, uses monitor defaults if not specified) | `string` | `null` | no | +| [base\_tags](#input\_base\_tags) | Base tags (key:value format) to add to this type of check (combined with `local.tags` and `var.additional_tags`, generally you should not change this) | `list(string)` |
[
"resource:apigateway"
]
| no | +| [cost\_center](#input\_cost\_center) | Cost Center of the monitored resource (leave blank to omit tag) | `string` | `null` | no | +| [dashboard\_link](#input\_dashboard\_link) | Dashboard link to include in message | `string` | `null` | no | +| [env](#input\_env) | Environment the monitored resource is in (leave blank to omit tag) | `string` | n/a | yes | +| [evaluation\_delay](#input\_evaluation\_delay) | Monitor evaluation delay (see [Datadog Docs](https://docs.datadoghq.com/monitors/configuration/?tab=thresholdalert#set-alert-conditions)) | `number` | `900` | no | +| [host\_unreachable\_enabled](#input\_host\_unreachable\_enabled) | Flag to enable Host unreachable monitor | `bool` | `true` | no | +| [monitor\_exclude\_tags](#input\_monitor\_exclude\_tags) | Tags to be excluded in the monitoring query. Specify in key:value format | `list(string)` | `[]` | no | +| [monitor\_include\_tags](#input\_monitor\_include\_tags) | Tags to be included in the monitoring query. Specify in key:value format | `list(string)` | `[]` | no | +| [new\_group\_delay](#input\_new\_group\_delay) | Delay in seconds before generating alerts for a new resource | `number` | `300` | no | +| [notify\_alert\_override](#input\_notify\_alert\_override) | List of notifications for alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_default](#input\_notify\_default) | List of alert notifications (can be overridden based on alert type) | `list(string)` | n/a | yes | +| [notify\_no\_data](#input\_notify\_no\_data) | Alert if no matching data is found | `bool` | `false` | no | +| [notify\_nodata\_override](#input\_notify\_nodata\_override) | List of notifications for no data (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_recovery\_override](#input\_notify\_recovery\_override) | List of notifications for alert recovery (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| 
[notify\_warn\_override](#input\_notify\_warn\_override) | List of notifications for alerts in warning threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [renotify\_interval](#input\_renotify\_interval) | Interval in minutes to re-send notifications about an alert | `number` | `0` | no | +| [runbook\_link](#input\_runbook\_link) | Runbook link to include in message | `string` | `null` | no | +| [service](#input\_service) | Service associated with the monitored resource (leave blank to omit tag) | `string` | `null` | no | +| [team](#input\_team) | Team supporting the monitored resource (leave blank to omit tag) | `string` | `null` | no | +| [timeout\_h](#input\_timeout\_h) | Auto-resolve alert in specified hours if condition no longer matches | `number` | `0` | no | +| [title\_prefix](#input\_title\_prefix) | Prefix all alerts with specified value in brackets | `string` | `null` | no | +| [title\_suffix](#input\_title\_suffix) | Suffix all alerts with specified value in parentheses | `string` | `null` | no | +| [warn\_priority](#input\_warn\_priority) | Priority for alerts within warning threshold (P1-P5, uses monitor defaults if not specified) | `string` | `null` | no | + +## Outputs + +No outputs. + diff --git a/host/cpu/README.md b/host/cpu/README.md index e69de29..f52bd4b 100644 --- a/host/cpu/README.md +++ b/host/cpu/README.md @@ -0,0 +1,70 @@ +# terraform-datadog-monitor/host/cpu + +Configures monitor for CPU Utilization. + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | ~> 1.5 | +| [datadog](#requirement\_datadog) | >= 3.37 | +| [null](#requirement\_null) | >= 3.1.0 | + +## Providers + +| Name | Version | +|------|---------| +| [datadog](#provider\_datadog) | 3.39.0 | + +## Modules + +No modules. 
+ +## Resources + +| Name | Type | +|------|------| +| [datadog_monitor.cpu_utilization](https://registry.terraform.io/providers/datadog/datadog/latest/docs/resources/monitor) | resource | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [additional\_tags](#input\_additional\_tags) | Additional tags (key:value format) to add to this type of check (combined with `local.tags` and `var.base_tags`) | `list(string)` | `[]` | no | +| [alert\_critical\_priority](#input\_alert\_critical\_priority) | Priority for alerts within critical threshold (P1-P5, uses monitor defaults if not specified) | `string` | `null` | no | +| [alert\_message](#input\_alert\_message) | Message to prepend to alert notifications | `string` | `"Alert"` | no | +| [alert\_nodata\_priority](#input\_alert\_nodata\_priority) | Priority for alerts within warning threshold (P1-P5, uses monitor defaults if not specified) | `string` | `null` | no | +| [base\_tags](#input\_base\_tags) | Base tags (key:value format) to add to this type of check (combined with `local.tags` and `var.additional_tags`, generally you should not change this) | `list(string)` |
[
"resource:apigateway"
]
| no | +| [cost\_center](#input\_cost\_center) | Cost Center of the monitored resource (leave blank to omit tag) | `string` | `null` | no | +| [cpu\_utilization\_enabled](#input\_cpu\_utilization\_enabled) | Flag to enable CPU Utilization monitor | `bool` | `true` | no | +| [cpu\_utilization\_no\_data\_window](#input\_cpu\_utilization\_no\_data\_window) | No data threshold (in minutes, 0 to disable) | `number` | `10` | no | +| [cpu\_utilization\_threshold\_critical](#input\_cpu\_utilization\_threshold\_critical) | Critical threshold (percent) | `number` | `90` | no | +| [cpu\_utilization\_threshold\_warning](#input\_cpu\_utilization\_threshold\_warning) | Warning threshold (percent) | `number` | `80` | no | +| [cpu\_utilization\_time\_aggregator](#input\_cpu\_utilization\_time\_aggregator) | Monitor aggregator for CPU high [available values: min, max or avg] | `string` | `"min"` | no | +| [cpu\_utilization\_timeframe](#input\_cpu\_utilization\_timeframe) | Monitor timeframe for CPU high [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | `string` | `"last_1h"` | no | +| [dashboard\_link](#input\_dashboard\_link) | Dashboard link to include in message | `string` | `null` | no | +| [env](#input\_env) | Environment the monitored resource is in (leave blank to omit tag) | `string` | n/a | yes | +| [evaluation\_delay](#input\_evaluation\_delay) | Monitor evaluation delay (see [Datadog Docs](https://docs.datadoghq.com/monitors/configuration/?tab=thresholdalert#set-alert-conditions)) | `number` | `900` | no | +| [monitor\_exclude\_tags](#input\_monitor\_exclude\_tags) | Tags to be excluded in the monitoring query. Specify in key:value format | `list(string)` | `[]` | no | +| [monitor\_include\_tags](#input\_monitor\_include\_tags) | Tags to be included in the monitoring query. 
Specify in key:value format | `list(string)` | `[]` | no | +| [new\_group\_delay](#input\_new\_group\_delay) | Delay in seconds before generating alerts for a new resource | `number` | `300` | no | +| [notify\_alert\_override](#input\_notify\_alert\_override) | List of notifications for alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_default](#input\_notify\_default) | List of alert notifications (can be overridden based on alert type) | `list(string)` | n/a | yes | +| [notify\_no\_data](#input\_notify\_no\_data) | Alert if no matching data is found | `bool` | `false` | no | +| [notify\_nodata\_override](#input\_notify\_nodata\_override) | List of notifications for no data (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_recovery\_override](#input\_notify\_recovery\_override) | List of notifications for alert recovery (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_warn\_override](#input\_notify\_warn\_override) | List of notifications for alerts in warning threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [renotify\_interval](#input\_renotify\_interval) | Interval in minutes to re-send notifications about an alert | `number` | `0` | no | +| [runbook\_link](#input\_runbook\_link) | Runbook link to include in message | `string` | `null` | no | +| [service](#input\_service) | Service associated with the monitored resource (leave blank to omit tag) | `string` | `null` | no | +| [team](#input\_team) | Team supporting the monitored resource (leave blank to omit tag) | `string` | `null` | no | +| [timeout\_h](#input\_timeout\_h) | Auto-resolve alert in specified hours if condition no longer matches | `number` | `0` | no | +| [title\_prefix](#input\_title\_prefix) | Prefix all alerts with specified value in brackets | `string` | `null` | no | +| [title\_suffix](#input\_title\_suffix) | Suffix all alerts with specified value in 
parenthesis | `string` | `null` | no | +| [warn\_priority](#input\_warn\_priority) | Priority for alerts with no data (P1-P5, uses monitor defaults if not specified) | `string` | `null` | no | + +## Outputs + +No outputs. + diff --git a/host/disk/README.md b/host/disk/README.md index e69de29..9f8c82a 100644 --- a/host/disk/README.md +++ b/host/disk/README.md @@ -0,0 +1,87 @@ +# terraform-datadog-monitor/host/disk + +Configures monitor for host disk space (forecast and actual) and disk inodes. + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | ~> 1.5 | +| [datadog](#requirement\_datadog) | >= 3.37 | +| [null](#requirement\_null) | >= 3.1.0 | + +## Providers + +| Name | Version | +|------|---------| +| [datadog](#provider\_datadog) | 3.39.0 | + +## Modules + +No modules. + +## Resources + +| Name | Type | +|------|------| +| [datadog_monitor.disk_inodes](https://registry.terraform.io/providers/datadog/datadog/latest/docs/resources/monitor) | resource | +| [datadog_monitor.disk_space](https://registry.terraform.io/providers/datadog/datadog/latest/docs/resources/monitor) | resource | +| [datadog_monitor.disk_space_forecast](https://registry.terraform.io/providers/datadog/datadog/latest/docs/resources/monitor) | resource | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [additional\_tags](#input\_additional\_tags) | Additional tags (key:value format) to add to this type of check (combined with `local.tags` and `var.base_tags`) | `list(string)` | `[]` | no | +| [alert\_critical\_priority](#input\_alert\_critical\_priority) | Priority for alerts within critical threshold (P1-P5, uses monitor defaults if not specified) | `string` | `null` | no | +| [alert\_message](#input\_alert\_message) | Message to prepend to alert notifications | `string` | `"Alert"` | no | +| [alert\_nodata\_priority](#input\_alert\_nodata\_priority) | Priority for alerts 
within warning threshold (P1-P5, uses monitor defaults if not specified) | `string` | `null` | no | +| [base\_tags](#input\_base\_tags) | Base tags (key:value format) to add to this type of check (combined with `local.tags` and `var.additional_tags`, generally you should not change this) | `list(string)` |
[
"resource:apigateway"
]
| no | +| [cost\_center](#input\_cost\_center) | Cost Center of the monitored resource (leave blank to omit tag) | `string` | `null` | no | +| [dashboard\_link](#input\_dashboard\_link) | Dashboard link to include in message | `string` | `null` | no | +| [disk\_inodes\_enabled](#input\_disk\_inodes\_enabled) | Flag to enable Free disk inodes monitor | `string` | `"true"` | no | +| [disk\_inodes\_threshold\_critical](#input\_disk\_inodes\_threshold\_critical) | Free disk inodes critical threshold | `number` | `95` | no | +| [disk\_inodes\_threshold\_warning](#input\_disk\_inodes\_threshold\_warning) | Free disk inodes warning threshold | `number` | `90` | no | +| [disk\_inodes\_time\_aggregator](#input\_disk\_inodes\_time\_aggregator) | Monitor aggregator for Free disk inodes [available values: min, max or avg] | `string` | `"min"` | no | +| [disk\_inodes\_timeframe](#input\_disk\_inodes\_timeframe) | Monitor timeframe for Free disk inodes [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | `string` | `"last_5m"` | no | +| [disk\_space\_enabled](#input\_disk\_space\_enabled) | Flag to enable Free diskspace monitor | `string` | `"true"` | no | +| [disk\_space\_forecast\_algorithm](#input\_disk\_space\_forecast\_algorithm) | Algorithm for the Free diskspace Forecast monitor [available values: `linear` or `seasonal`] | `string` | `"linear"` | no | +| [disk\_space\_forecast\_deviations](#input\_disk\_space\_forecast\_deviations) | Deviations for the Free diskspace Forecast monitor [available values: `1`, `2`, `3`, `4` or `5`] | `string` | `1` | no | +| [disk\_space\_forecast\_enabled](#input\_disk\_space\_forecast\_enabled) | Flag to enable Free diskspace forecast monitor | `string` | `"true"` | no | +| [disk\_space\_forecast\_interval](#input\_disk\_space\_forecast\_interval) | Interval for the Free diskspace Forecast monitor [available values: `30m`, `60m` or `120m`] | `string` | `"60m"` | no | +| 
[disk\_space\_forecast\_linear\_history](#input\_disk\_space\_forecast\_linear\_history) | History for the Free diskspace Forecast monitor [available values: `12h`, `#d` (1, 2, or 3), `#w` (1, or 2) or `#mo` (1, 2 or 3)] | `string` | `"1w"` | no | +| [disk\_space\_forecast\_linear\_model](#input\_disk\_space\_forecast\_linear\_model) | Model for the Free diskspace Forecast monitor [available values: `default`, `simple` or `reactive`] | `string` | `"default"` | no | +| [disk\_space\_forecast\_seasonal\_seasonality](#input\_disk\_space\_forecast\_seasonal\_seasonality) | Seasonality for the Free diskspace Forecast monitor | `string` | `"weekly"` | no | +| [disk\_space\_forecast\_threshold\_critical](#input\_disk\_space\_forecast\_threshold\_critical) | Free disk space forecast critical threshold | `number` | `80` | no | +| [disk\_space\_forecast\_threshold\_critical\_recovery](#input\_disk\_space\_forecast\_threshold\_critical\_recovery) | Free disk space forecast recovery threshold | `number` | `72` | no | +| [disk\_space\_forecast\_time\_aggregator](#input\_disk\_space\_forecast\_time\_aggregator) | Monitor aggregator for Free diskspace forecast [available values: min, max or avg] | `string` | `"max"` | no | +| [disk\_space\_forecast\_timeframe](#input\_disk\_space\_forecast\_timeframe) | Monitor timeframe for Free diskspace forecast [available values: `next_12h`, `next_#d` (1, 2, or 3), `next_#w` (1 or 2) or `next_#mo` (1, 2 or 3)] | `string` | `"next_1w"` | no | +| [disk\_space\_threshold\_critical](#input\_disk\_space\_threshold\_critical) | Free disk space critical threshold | `number` | `90` | no | +| [disk\_space\_threshold\_warning](#input\_disk\_space\_threshold\_warning) | Free disk space warning threshold | `number` | `80` | no | +| [disk\_space\_time\_aggregator](#input\_disk\_space\_time\_aggregator) | Monitor aggregator for Free diskspace [available values: min, max or avg] | `string` | `"max"` | no | +| 
[disk\_space\_timeframe](#input\_disk\_space\_timeframe) | Monitor timeframe for Free diskspace [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | `string` | `"last_5m"` | no | +| [env](#input\_env) | Environment the monitored resource is in (leave blank to omit tag) | `string` | n/a | yes | +| [evaluation\_delay](#input\_evaluation\_delay) | Monitor evaluation delay (see [Datadog Docs](https://docs.datadoghq.com/monitors/configuration/?tab=thresholdalert#set-alert-conditions)) | `number` | `900` | no | +| [monitor\_exclude\_tags](#input\_monitor\_exclude\_tags) | Tags to be excluded in the monitoring query. Specify in key:value format | `list(string)` | `[]` | no | +| [monitor\_include\_tags](#input\_monitor\_include\_tags) | Tags to be included in the monitoring query. Specify in key:value format | `list(string)` | `[]` | no | +| [new\_group\_delay](#input\_new\_group\_delay) | Delay in seconds before generating alerts for a new resource | `number` | `300` | no | +| [notify\_alert\_override](#input\_notify\_alert\_override) | List of notifications for alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_default](#input\_notify\_default) | List of alert notifications (can be overridden based on alert type) | `list(string)` | n/a | yes | +| [notify\_no\_data](#input\_notify\_no\_data) | Alert if no matching data is found | `bool` | `false` | no | +| [notify\_nodata\_override](#input\_notify\_nodata\_override) | List of notifications for no data (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_recovery\_override](#input\_notify\_recovery\_override) | List of notifications for alert recovery (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_warn\_override](#input\_notify\_warn\_override) | List of notifications for alerts in warning threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| 
[renotify\_interval](#input\_renotify\_interval) | Interval in minutes to re-send notifications about an alert | `number` | `0` | no | +| [runbook\_link](#input\_runbook\_link) | Runbook link to include in message | `string` | `null` | no | +| [service](#input\_service) | Service associated with the monitored resource (leave blank to omit tag) | `string` | `null` | no | +| [team](#input\_team) | Team supporting the monitored resource (leave blank to omit tag) | `string` | `null` | no | +| [timeout\_h](#input\_timeout\_h) | Auto-resolve alert in specified hours if condition no longer matches | `number` | `0` | no | +| [title\_prefix](#input\_title\_prefix) | Prefix all alerts with specified value in brackets | `string` | `null` | no | +| [title\_suffix](#input\_title\_suffix) | Suffix all alerts with specified value in parenthesis | `string` | `null` | no | +| [warn\_priority](#input\_warn\_priority) | Priority for alerts with no data (P1-P5, uses monitor defaults if not specified) | `string` | `null` | no | + +## Outputs + +No outputs. + diff --git a/host/memory/README.md b/host/memory/README.md index e69de29..c4cb6c6 100644 --- a/host/memory/README.md +++ b/host/memory/README.md @@ -0,0 +1,69 @@ +# terraform-datadog-monitor/host/memory + +Configures monitor for Host Memory Utilization. + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | ~> 1.5 | +| [datadog](#requirement\_datadog) | >= 3.37 | +| [null](#requirement\_null) | >= 3.1.0 | + +## Providers + +| Name | Version | +|------|---------| +| [datadog](#provider\_datadog) | 3.39.0 | + +## Modules + +No modules. 
+ +## Resources + +| Name | Type | +|------|------| +| [datadog_monitor.memory](https://registry.terraform.io/providers/datadog/datadog/latest/docs/resources/monitor) | resource | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [additional\_tags](#input\_additional\_tags) | Additional tags (key:value format) to add to this type of check (combined with `local.tags` and `var.base_tags`) | `list(string)` | `[]` | no | +| [alert\_critical\_priority](#input\_alert\_critical\_priority) | Priority for alerts within critical threshold (P1-P5, uses monitor defaults if not specified) | `string` | `null` | no | +| [alert\_message](#input\_alert\_message) | Message to prepend to alert notifications | `string` | `"Alert"` | no | +| [alert\_nodata\_priority](#input\_alert\_nodata\_priority) | Priority for alerts within warning threshold (P1-P5, uses monitor defaults if not specified) | `string` | `null` | no | +| [base\_tags](#input\_base\_tags) | Base tags (key:value format) to add to this type of check (combined with `local.tags` and `var.additional_tags`, generally you should not change this) | `list(string)` |
[
"resource:apigateway"
]
| no | +| [cost\_center](#input\_cost\_center) | Cost Center of the monitored resource (leave blank to omit tag) | `string` | `null` | no | +| [dashboard\_link](#input\_dashboard\_link) | Dashboard link to include in message | `string` | `null` | no | +| [env](#input\_env) | Environment the monitored resource is in (leave blank to omit tag) | `string` | n/a | yes | +| [evaluation\_delay](#input\_evaluation\_delay) | Monitor evaluation delay (see [Datadog Docs](https://docs.datadoghq.com/monitors/configuration/?tab=thresholdalert#set-alert-conditions)) | `number` | `900` | no | +| [memory\_enabled](#input\_memory\_enabled) | Flag to enable Free memory monitor | `string` | `"true"` | no | +| [memory\_threshold\_critical](#input\_memory\_threshold\_critical) | Free memory critical threshold | `number` | `5` | no | +| [memory\_threshold\_warning](#input\_memory\_threshold\_warning) | Free memory warning threshold | `number` | `10` | no | +| [memory\_time\_aggregator](#input\_memory\_time\_aggregator) | Monitor aggregator for Free memory [available values: min, max or avg] | `string` | `"max"` | no | +| [memory\_timeframe](#input\_memory\_timeframe) | Monitor timeframe for Free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | `string` | `"last_5m"` | no | +| [monitor\_exclude\_tags](#input\_monitor\_exclude\_tags) | Tags to be excluded in the monitoring query. Specify in key:value format | `list(string)` | `[]` | no | +| [monitor\_include\_tags](#input\_monitor\_include\_tags) | Tags to be included in the monitoring query. 
Specify in key:value format | `list(string)` | `[]` | no | +| [new\_group\_delay](#input\_new\_group\_delay) | Delay in seconds before generating alerts for a new resource | `number` | `300` | no | +| [notify\_alert\_override](#input\_notify\_alert\_override) | List of notifications for alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_default](#input\_notify\_default) | List of alert notifications (can be overridden based on alert type) | `list(string)` | n/a | yes | +| [notify\_no\_data](#input\_notify\_no\_data) | Alert if no matching data is found | `bool` | `false` | no | +| [notify\_nodata\_override](#input\_notify\_nodata\_override) | List of notifications for no data (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_recovery\_override](#input\_notify\_recovery\_override) | List of notifications for alert recovery (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_warn\_override](#input\_notify\_warn\_override) | List of notifications for alerts in warning threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [renotify\_interval](#input\_renotify\_interval) | Interval in minutes to re-send notifications about an alert | `number` | `0` | no | +| [runbook\_link](#input\_runbook\_link) | Runbook link to include in message | `string` | `null` | no | +| [service](#input\_service) | Service associated with the monitored resource (leave blank to omit tag) | `string` | `null` | no | +| [team](#input\_team) | Team supporting the monitored resource (leave blank to omit tag) | `string` | `null` | no | +| [timeout\_h](#input\_timeout\_h) | Auto-resolve alert in specified hours if condition no longer matches | `number` | `0` | no | +| [title\_prefix](#input\_title\_prefix) | Prefix all alerts with specified value in brackets | `string` | `null` | no | +| [title\_suffix](#input\_title\_suffix) | Suffix all alerts with specified value in 
parenthesis | `string` | `null` | no | +| [warn\_priority](#input\_warn\_priority) | Priority for alerts with no data (P1-P5, uses monitor defaults if not specified) | `string` | `null` | no | + +## Outputs + +No outputs. + diff --git a/host/process/README.md b/host/process/README.md index e69de29..93a1bb8 100644 --- a/host/process/README.md +++ b/host/process/README.md @@ -0,0 +1,70 @@ +# terraform-datadog-monitor/host/process + +Configures monitor for Processes on Host. + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | ~> 1.5 | +| [datadog](#requirement\_datadog) | >= 3.37 | +| [null](#requirement\_null) | >= 3.1.0 | + +## Providers + +| Name | Version | +|------|---------| +| [datadog](#provider\_datadog) | 3.39.0 | + +## Modules + +No modules. + +## Resources + +| Name | Type | +|------|------| +| [datadog_monitor.process_check](https://registry.terraform.io/providers/datadog/datadog/latest/docs/resources/monitor) | resource | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [additional\_tags](#input\_additional\_tags) | Additional tags (key:value format) to add to this type of check (combined with `local.tags` and `var.base_tags`) | `list(string)` | `[]` | no | +| [alert\_critical\_priority](#input\_alert\_critical\_priority) | Priority for alerts within critical threshold (P1-P5, uses monitor defaults if not specified) | `string` | `null` | no | +| [alert\_message](#input\_alert\_message) | Message to prepend to alert notifications | `string` | `"Alert"` | no | +| [alert\_nodata\_priority](#input\_alert\_nodata\_priority) | Priority for alerts within warning threshold (P1-P5, uses monitor defaults if not specified) | `string` | `null` | no | +| [base\_tags](#input\_base\_tags) | Base tags (key:value format) to add to this type of check (combined with `local.tags` and `var.additional_tags`, generally you should not change this) | 
`list(string)` |
[
"resource:apigateway"
]
| no | +| [cost\_center](#input\_cost\_center) | Cost Center of the monitored resource (leave blank to omit tag) | `string` | `null` | no | +| [dashboard\_link](#input\_dashboard\_link) | Dashboard link to include in message | `string` | `null` | no | +| [env](#input\_env) | Environment the monitored resource is in (leave blank to omit tag) | `string` | n/a | yes | +| [evaluation\_delay](#input\_evaluation\_delay) | Monitor evaluation delay (see [Datadog Docs](https://docs.datadoghq.com/monitors/configuration/?tab=thresholdalert#set-alert-conditions)) | `number` | `900` | no | +| [monitor\_exclude\_tags](#input\_monitor\_exclude\_tags) | Tags to be excluded in the monitoring query. Specify in key:value format | `list(string)` | `[]` | no | +| [monitor\_include\_tags](#input\_monitor\_include\_tags) | Tags to be included in the monitoring query. Specify in key:value format | `list(string)` | `[]` | no | +| [new\_group\_delay](#input\_new\_group\_delay) | Delay in seconds before generating alerts for a new resource | `number` | `300` | no | +| [notify\_alert\_override](#input\_notify\_alert\_override) | List of notifications for alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_default](#input\_notify\_default) | List of alert notifications (can be overridden based on alert type) | `list(string)` | n/a | yes | +| [notify\_no\_data](#input\_notify\_no\_data) | Alert if no matching data is found | `bool` | `false` | no | +| [notify\_nodata\_override](#input\_notify\_nodata\_override) | List of notifications for no data (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_recovery\_override](#input\_notify\_recovery\_override) | List of notifications for alert recovery (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_warn\_override](#input\_notify\_warn\_override) | List of notifications for alerts in warning threshold (uses `notify_default` otherwise) | 
`list(string)` | `[]` | no | +| [process\_check\_enabled](#input\_process\_check\_enabled) | Flag to enable Process Check monitor | `string` | `"true"` | no | +| [process\_check\_name](#input\_process\_check\_name) | Name of Process for Process Check Monitor | `string` | `""` | no | +| [process\_check\_threshold\_critical](#input\_process\_check\_threshold\_critical) | Process Check critical threshold | `number` | `5` | no | +| [process\_check\_threshold\_ok](#input\_process\_check\_threshold\_ok) | Process Check ok threshold | `number` | `1` | no | +| [process\_check\_threshold\_warning](#input\_process\_check\_threshold\_warning) | Process Check warning threshold | `number` | `2` | no | +| [process\_check\_timeframe](#input\_process\_check\_timeframe) | Monitor timeframe for Process Check [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | `string` | `"last_5m"` | no | +| [renotify\_interval](#input\_renotify\_interval) | Interval in minutes to re-send notifications about an alert | `number` | `0` | no | +| [runbook\_link](#input\_runbook\_link) | Runbook link to include in message | `string` | `null` | no | +| [service](#input\_service) | Service associated with the monitored resource (leave blank to omit tag) | `string` | `null` | no | +| [team](#input\_team) | Team supporting the monitored resource (leave blank to omit tag) | `string` | `null` | no | +| [timeout\_h](#input\_timeout\_h) | Auto-resolve alert in specified hours if condition no longer matches | `number` | `0` | no | +| [title\_prefix](#input\_title\_prefix) | Prefix all alerts with specified value in brackets | `string` | `null` | no | +| [title\_suffix](#input\_title\_suffix) | Suffix all alerts with specified value in parenthesis | `string` | `null` | no | +| [warn\_priority](#input\_warn\_priority) | Priority for alerts with no data (P1-P5, uses monitor defaults if not specified) | `string` | `null` | no | + +## Outputs + +No outputs. 
+ diff --git a/host/swap/README.md b/host/swap/README.md index e69de29..4632a84 100644 --- a/host/swap/README.md +++ b/host/swap/README.md @@ -0,0 +1,69 @@ +# terraform-datadog-monitor/host/swap + +Configures monitor for Swap Memory Free. + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | ~> 1.5 | +| [datadog](#requirement\_datadog) | >= 3.37 | +| [null](#requirement\_null) | >= 3.1.0 | + +## Providers + +| Name | Version | +|------|---------| +| [datadog](#provider\_datadog) | 3.39.0 | + +## Modules + +No modules. + +## Resources + +| Name | Type | +|------|------| +| [datadog_monitor.swap](https://registry.terraform.io/providers/datadog/datadog/latest/docs/resources/monitor) | resource | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [additional\_tags](#input\_additional\_tags) | Additional tags (key:value format) to add to this type of check (combined with `local.tags` and `var.base_tags`) | `list(string)` | `[]` | no | +| [alert\_critical\_priority](#input\_alert\_critical\_priority) | Priority for alerts within critical threshold (P1-P5, uses monitor defaults if not specified) | `string` | `null` | no | +| [alert\_message](#input\_alert\_message) | Message to prepend to alert notifications | `string` | `"Alert"` | no | +| [alert\_nodata\_priority](#input\_alert\_nodata\_priority) | Priority for alerts within warning threshold (P1-P5, uses monitor defaults if not specified) | `string` | `null` | no | +| [base\_tags](#input\_base\_tags) | Base tags (key:value format) to add to this type of check (combined with `local.tags` and `var.additional_tags`, generally you should not change this) | `list(string)` |
[
"resource:apigateway"
]
| no | +| [cost\_center](#input\_cost\_center) | Cost Center of the monitored resource (leave blank to omit tag) | `string` | `null` | no | +| [dashboard\_link](#input\_dashboard\_link) | Dashboard link to include in message | `string` | `null` | no | +| [env](#input\_env) | Environment the monitored resource is in (leave blank to omit tag) | `string` | n/a | yes | +| [evaluation\_delay](#input\_evaluation\_delay) | Monitor evaluation delay (see [Datadog Docs](https://docs.datadoghq.com/monitors/configuration/?tab=thresholdalert#set-alert-conditions)) | `number` | `900` | no | +| [monitor\_exclude\_tags](#input\_monitor\_exclude\_tags) | Tags to be excluded in the monitoring query. Specify in key:value format | `list(string)` | `[]` | no | +| [monitor\_include\_tags](#input\_monitor\_include\_tags) | Tags to be included in the monitoring query. Specify in key:value format | `list(string)` | `[]` | no | +| [new\_group\_delay](#input\_new\_group\_delay) | Delay in seconds before generating alerts for a new resource | `number` | `300` | no | +| [notify\_alert\_override](#input\_notify\_alert\_override) | List of notifications for alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_default](#input\_notify\_default) | List of alert notifications (can be overridden based on alert type) | `list(string)` | n/a | yes | +| [notify\_no\_data](#input\_notify\_no\_data) | Alert if no matching data is found | `bool` | `false` | no | +| [notify\_nodata\_override](#input\_notify\_nodata\_override) | List of notifications for no data (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_recovery\_override](#input\_notify\_recovery\_override) | List of notifications for alert recovery (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_warn\_override](#input\_notify\_warn\_override) | List of notifications for alerts in warning threshold (uses `notify_default` otherwise) | 
`list(string)` | `[]` | no | +| [renotify\_interval](#input\_renotify\_interval) | Interval in minutes to re-send notifications about an alert | `number` | `0` | no | +| [runbook\_link](#input\_runbook\_link) | Runbook link to include in message | `string` | `null` | no | +| [service](#input\_service) | Service associated with the monitored resource (leave blank to omit tag) | `string` | `null` | no | +| [swap\_enabled](#input\_swap\_enabled) | Flag to enable Swap monitor | `string` | `"true"` | no | +| [swap\_threshold\_critical](#input\_swap\_threshold\_critical) | Free Swap critical threshold as percentage | `number` | `0.1` | no | +| [swap\_threshold\_warning](#input\_swap\_threshold\_warning) | Free Swap warning threshold as percentage | `number` | `0.3` | no | +| [swap\_time\_aggregator](#input\_swap\_time\_aggregator) | Monitor aggregator for Free Swap [available values: min, max or avg] | `string` | `"max"` | no | +| [swap\_timeframe](#input\_swap\_timeframe) | Monitor timeframe for Free Swap [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | `string` | `"last_5m"` | no | +| [team](#input\_team) | Team supporting the monitored resource (leave blank to omit tag) | `string` | `null` | no | +| [timeout\_h](#input\_timeout\_h) | Auto-resolve alert in specified hours if condition no longer matches | `number` | `0` | no | +| [title\_prefix](#input\_title\_prefix) | Prefix all alerts with specified value in brackets | `string` | `null` | no | +| [title\_suffix](#input\_title\_suffix) | Suffix all alerts with specified value in parenthesis | `string` | `null` | no | +| [warn\_priority](#input\_warn\_priority) | Priority for alerts with no data (P1-P5, uses monitor defaults if not specified) | `string` | `null` | no | + +## Outputs + +No outputs. 
+ From 9b679f9cee55c0d463d0f8b1f945a1f4be66654e Mon Sep 17 00:00:00 2001 From: Lalo Galvan <106835113+lalo-galvan@users.noreply.github.com> Date: Thu, 13 Jun 2024 03:05:46 -0600 Subject: [PATCH 3/6] add num-process check --- host/num-process/README.md | 70 +++++++++++++++++++++++++++++++++++ host/num-process/common.tf | 1 + host/num-process/main.tf | 37 ++++++++++++++++++ host/num-process/variables.tf | 53 ++++++++++++++++++++++++++ host/num-process/versions.tf | 1 + 5 files changed, 162 insertions(+) create mode 100644 host/num-process/README.md create mode 120000 host/num-process/common.tf create mode 100644 host/num-process/main.tf create mode 100644 host/num-process/variables.tf create mode 120000 host/num-process/versions.tf diff --git a/host/num-process/README.md b/host/num-process/README.md new file mode 100644 index 0000000..b041dae --- /dev/null +++ b/host/num-process/README.md @@ -0,0 +1,70 @@ +# terraform-datadog-monitor/host/num-process + +Number of Processes Check to alert on stale or runaway processes. + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | ~> 1.5 | +| [datadog](#requirement\_datadog) | >= 3.37 | +| [null](#requirement\_null) | >= 3.1.0 | + +## Providers + +| Name | Version | +|------|---------| +| [datadog](#provider\_datadog) | >= 3.37 | + +## Modules + +No modules. 
+ +## Resources + +| Name | Type | +|------|------| +| [datadog_monitor.num_process_check](https://registry.terraform.io/providers/datadog/datadog/latest/docs/resources/monitor) | resource | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [additional\_tags](#input\_additional\_tags) | Additional tags (key:value format) to add to this type of check (combined with `local.tags` and `var.base_tags`) | `list(string)` | `[]` | no | +| [alert\_critical\_priority](#input\_alert\_critical\_priority) | Priority for alerts within critical threshold (P1-P5, uses monitor defaults if not specified) | `string` | `null` | no | +| [alert\_message](#input\_alert\_message) | Message to prepend to alert notifications | `string` | `"Alert"` | no | +| [alert\_nodata\_priority](#input\_alert\_nodata\_priority) | Priority for alerts with no data (P1-P5, uses monitor defaults if not specified) | `string` | `null` | no | +| [base\_tags](#input\_base\_tags) | Base tags (key:value format) to add to this type of check (combined with `local.tags` and `var.additional_tags`, generally you should not change this) | `list(string)` |
[
"resource:apigateway"
]
| no | +| [cost\_center](#input\_cost\_center) | Cost Center of the monitored resource (leave blank to omit tag) | `string` | `null` | no | +| [dashboard\_link](#input\_dashboard\_link) | Dashboard link to include in message | `string` | `null` | no | +| [env](#input\_env) | Environment the monitored resource is in (leave blank to omit tag) | `string` | n/a | yes | +| [evaluation\_delay](#input\_evaluation\_delay) | Monitor evaluation delay (see [Datadog Docs](https://docs.datadoghq.com/monitors/configuration/?tab=thresholdalert#set-alert-conditions)) | `number` | `900` | no | +| [monitor\_exclude\_tags](#input\_monitor\_exclude\_tags) | Tags to be excluded in the monitoring query. Specify in key:value format | `list(string)` | `[]` | no | +| [monitor\_include\_tags](#input\_monitor\_include\_tags) | Tags to be included in the monitoring query. Specify in key:value format | `list(string)` | `[]` | no | +| [new\_group\_delay](#input\_new\_group\_delay) | Delay in seconds before generating alerts for a new resource | `number` | `300` | no | +| [notify\_alert\_override](#input\_notify\_alert\_override) | List of notifications for alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_default](#input\_notify\_default) | List of alert notifications (can be overridden based on alert type) | `list(string)` | n/a | yes | +| [notify\_no\_data](#input\_notify\_no\_data) | Alert if no matching data is found | `bool` | `false` | no | +| [notify\_nodata\_override](#input\_notify\_nodata\_override) | List of notifications for no data (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_recovery\_override](#input\_notify\_recovery\_override) | List of notifications for alert recovery (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_warn\_override](#input\_notify\_warn\_override) | List of notifications for alerts in warning threshold (uses `notify_default` otherwise) | 
`list(string)` | `[]` | no | +| [num\_process\_check\_enabled](#input\_num\_process\_check\_enabled) | Flag to enable Num Process Check monitor | `string` | `"true"` | no | +| [num\_process\_check\_name](#input\_num\_process\_check\_name) | Name of Process for Num Process Check Monitor | `string` | `""` | no | +| [num\_process\_check\_operator](#input\_num\_process\_check\_operator) | Operator for Num Proccess Check Query [available values: `<, >, <=, >=, =`] | `string` | `"<="` | no | +| [num\_process\_check\_threshold\_critical](#input\_num\_process\_check\_threshold\_critical) | Num Proccess Check critical threshold | `number` | `1` | no | +| [num\_process\_check\_threshold\_warning](#input\_num\_process\_check\_threshold\_warning) | Num Proccess Check warning threshold | `number` | `2` | no | +| [num\_process\_check\_timeframe](#input\_num\_process\_check\_timeframe) | Monitor timeframe for Num Process Check [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | `string` | `"last_5m"` | no | +| [renotify\_interval](#input\_renotify\_interval) | Interval in minutes to re-send notifications about an alert | `number` | `0` | no | +| [runbook\_link](#input\_runbook\_link) | Runbook link to include in message | `string` | `null` | no | +| [service](#input\_service) | Service associated with the monitored resource (leave blank to omit tag) | `string` | `null` | no | +| [team](#input\_team) | Team supporting the monitored resource (leave blank to omit tag) | `string` | `null` | no | +| [timeout\_h](#input\_timeout\_h) | Auto-resolve alert in specified hours if condition no longer matches | `number` | `0` | no | +| [title\_prefix](#input\_title\_prefix) | Prefix all alerts with specified value in brackets | `string` | `null` | no | +| [title\_suffix](#input\_title\_suffix) | Suffix all alerts with specified value in parenthesis | `string` | `null` | no | +| [warn\_priority](#input\_warn\_priority) | Priority for alerts within warning threshold 
(P1-P5, uses monitor defaults if not specified) | `string` | `null` | no | + +## Outputs + +No outputs. + diff --git a/host/num-process/common.tf b/host/num-process/common.tf new file mode 120000 index 0000000..47c0063 --- /dev/null +++ b/host/num-process/common.tf @@ -0,0 +1 @@ +../../common/common.tf \ No newline at end of file diff --git a/host/num-process/main.tf b/host/num-process/main.tf new file mode 100644 index 0000000..8c6f0b7 --- /dev/null +++ b/host/num-process/main.tf @@ -0,0 +1,37 @@ +locals { + # these must be defined but do not need to be overridden + monitor_alert_default_priority = null + monitor_warn_default_priority = null + monitor_nodata_default_priority = null + + title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] " + title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})" +} + +resource "datadog_monitor" "num_process_check" { + count = var.num_process_check_enabled ? 1 : 0 + + name = join("", [local.title_prefix, "Process Check - {{host.name}}", local.title_suffix]) + message = local.query_alert_base_message + tags = concat(local.common_tags, var.base_tags, var.additional_tags) + type = "query alert" + + evaluation_delay = var.evaluation_delay + new_group_delay = var.new_group_delay + notify_no_data = false + renotify_interval = 0 + notify_audit = false + timeout_h = var.timeout_h + include_tags = true + require_full_window = true + + query = < Date: Thu, 13 Jun 2024 03:06:15 -0600 Subject: [PATCH 4/6] add num-process check --- host/num-process/.terraform.lock.hcl | 44 ++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 host/num-process/.terraform.lock.hcl diff --git a/host/num-process/.terraform.lock.hcl b/host/num-process/.terraform.lock.hcl new file mode 100644 index 0000000..0791549 --- /dev/null +++ b/host/num-process/.terraform.lock.hcl @@ -0,0 +1,44 @@ +# This file is maintained automatically by "terraform init". 
+# Manual edits may be lost in future updates. + +provider "registry.terraform.io/datadog/datadog" { + version = "3.39.0" + constraints = ">= 3.37.0" + hashes = [ + "h1:6rSVMI3RBWRHA70wH8eIGyzH1IZHu1hwdixT+UbD9S0=", + "zh:3389298e5116232edd1c6e64c7a83f696d5663f1a1c7da5bb32be3472faa4a9e", + "zh:39b5d91e58372716d9c7d4869e8391612d3799a936f7c94cc9b3efbe08f4a6d9", + "zh:4a077105269fd187fc751a390d3e894092a2dd1b44e7e76ff9923968d971dc6a", + "zh:4ddfc1d5839eb4b88bdf3c620bc7f3a94c850d5e5a21b8c818bba63c4f264d24", + "zh:65d076f03bbaa2782c7487411bda925c14ee4841aefd0eda5442c5e4e73856cd", + "zh:7e357a7fe969c7ccbcb755799b95612f9e0bd835cfd993b05b5303c6dde1ffbd", + "zh:a1f11d390762eddaeef701c9d2747c8a9e113afa7fe7284210d62a21d1113f6e", + "zh:a7a17b04e66d7b13dca6240dab393a3dffec4429fc685fdeabd7113064cd8331", + "zh:af60a0062adcf35d89b159a45f1ce77ad8ea196bccaea1d1e17ab8caffbe859a", + "zh:bf8e5836cb7cbc82fae10b874d137fd5e804c7308ce05c76eaebee6e5b26123e", + "zh:c1c4742ea3be506124a494bdc9d2f4ec777e98e0ad6add9b6e81d4703dc4ac1e", + "zh:d121a6fcb947c674026b35751f10ffe4941ab3c163d4c0f210d053e0984f11e6", + "zh:d99b1c20a343c4cf7c4a2117e9b29eb8056d9f481c52d19500c037c8b6e87878", + "zh:e1338c712edb4a87b7ebeb9bd4f7ae72dc3b7574fdac45bc8e2d0cef784e0597", + ] +} + +provider "registry.terraform.io/hashicorp/null" { + version = "3.2.2" + constraints = ">= 3.1.0" + hashes = [ + "h1:IMVAUHKoydFrlPrl9OzasDnw/8ntZFerCC9iXw1rXQY=", + "zh:3248aae6a2198f3ec8394218d05bd5e42be59f43a3a7c0b71c66ec0df08b69e7", + "zh:32b1aaa1c3013d33c245493f4a65465eab9436b454d250102729321a44c8ab9a", + "zh:38eff7e470acb48f66380a73a5c7cdd76cc9b9c9ba9a7249c7991488abe22fe3", + "zh:4c2f1faee67af104f5f9e711c4574ff4d298afaa8a420680b0cb55d7bbc65606", + "zh:544b33b757c0b954dbb87db83a5ad921edd61f02f1dc86c6186a5ea86465b546", + "zh:696cf785090e1e8cf1587499516b0494f47413b43cb99877ad97f5d0de3dc539", + "zh:6e301f34757b5d265ae44467d95306d61bef5e41930be1365f5a8dcf80f59452", + "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", + 
"zh:913a929070c819e59e94bb37a2a253c228f83921136ff4a7aa1a178c7cce5422", + "zh:aa9015926cd152425dbf86d1abdbc74bfe0e1ba3d26b3db35051d7b9ca9f72ae", + "zh:bb04798b016e1e1d49bcc76d62c53b56c88c63d6f2dfe38821afef17c416a0e1", + "zh:c23084e1b23577de22603cff752e59128d83cfecc2e6819edadd8cf7a10af11e", + ] +} From cbf6d64cfd40bf700ecec1b8818d1674269f7517 Mon Sep 17 00:00:00 2001 From: Lalo Galvan <106835113+lalo-galvan@users.noreply.github.com> Date: Fri, 12 Jul 2024 01:37:47 -0600 Subject: [PATCH 5/6] fix spelling mistakes --- host/num-process/variables.tf | 6 +++--- host/process/variables.tf | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/host/num-process/variables.tf b/host/num-process/variables.tf index f68524b..81d1d3d 100644 --- a/host/num-process/variables.tf +++ b/host/num-process/variables.tf @@ -35,19 +35,19 @@ variable "num_process_check_timeframe" { } variable "num_process_check_threshold_warning" { - description = "Num Proccess Check warning threshold" + description = "Num Process Check warning threshold" type = number default = 2 } variable "num_process_check_threshold_critical" { - description = "Num Proccess Check critical threshold" + description = "Num Process Check critical threshold" type = number default = 1 } variable "num_process_check_operator" { - description = "Operator for Num Proccess Check Query [available values: `<, >, <=, >=, =`]" + description = "Operator for Num Process Check Query [available values: `<, >, <=, >=, =`]" type = string default = "<=" } diff --git a/host/process/variables.tf b/host/process/variables.tf index 586814d..e36e098 100644 --- a/host/process/variables.tf +++ b/host/process/variables.tf @@ -35,19 +35,19 @@ variable "process_check_timeframe" { } variable "process_check_threshold_warning" { - description = "Proccess Check warning threshold" + description = "Process Check warning threshold" type = number default = 2 } variable "process_check_threshold_critical" { - description = "Proccess Check 
critical threshold" + description = "Process Check critical threshold" type = number default = 5 } variable "process_check_threshold_ok" { - description = "Proccess Check ok threshold" + description = "Process Check ok threshold" type = number default = 1 } From 53810919c0c90ae8d05160d2d7f642d18c5bee5e Mon Sep 17 00:00:00 2001 From: Lalo Galvan <106835113+lalo-galvan@users.noreply.github.com> Date: Fri, 12 Jul 2024 01:40:10 -0600 Subject: [PATCH 6/6] fix spelling mistakes --- host/num-process/README.md | 8 ++++---- host/process/README.md | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/host/num-process/README.md b/host/num-process/README.md index b041dae..11ce261 100644 --- a/host/num-process/README.md +++ b/host/num-process/README.md @@ -15,7 +15,7 @@ Number of Processes Check to alert on stale or runaway processes. | Name | Version | |------|---------| -| [datadog](#provider\_datadog) | >= 3.37 | +| [datadog](#provider\_datadog) | 3.39.0 | ## Modules @@ -51,9 +51,9 @@ No modules. 
| [notify\_warn\_override](#input\_notify\_warn\_override) | List of notifications for alerts in warning threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | | [num\_process\_check\_enabled](#input\_num\_process\_check\_enabled) | Flag to enable Num Process Check monitor | `string` | `"true"` | no | | [num\_process\_check\_name](#input\_num\_process\_check\_name) | Name of Process for Num Process Check Monitor | `string` | `""` | no | -| [num\_process\_check\_operator](#input\_num\_process\_check\_operator) | Operator for Num Proccess Check Query [available values: `<, >, <=, >=, =`] | `string` | `"<="` | no | -| [num\_process\_check\_threshold\_critical](#input\_num\_process\_check\_threshold\_critical) | Num Proccess Check critical threshold | `number` | `1` | no | -| [num\_process\_check\_threshold\_warning](#input\_num\_process\_check\_threshold\_warning) | Num Proccess Check warning threshold | `number` | `2` | no | +| [num\_process\_check\_operator](#input\_num\_process\_check\_operator) | Operator for Num Process Check Query [available values: `<, >, <=, >=, =`] | `string` | `"<="` | no | +| [num\_process\_check\_threshold\_critical](#input\_num\_process\_check\_threshold\_critical) | Num Process Check critical threshold | `number` | `1` | no | +| [num\_process\_check\_threshold\_warning](#input\_num\_process\_check\_threshold\_warning) | Num Process Check warning threshold | `number` | `2` | no | | [num\_process\_check\_timeframe](#input\_num\_process\_check\_timeframe) | Monitor timeframe for Num Process Check [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | `string` | `"last_5m"` | no | | [renotify\_interval](#input\_renotify\_interval) | Interval in minutes to re-send notifications about an alert | `number` | `0` | no | | [runbook\_link](#input\_runbook\_link) | Runbook link to include in message | `string` | `null` | no | diff --git a/host/process/README.md b/host/process/README.md index 
93a1bb8..58382b3 100644 --- a/host/process/README.md +++ b/host/process/README.md @@ -51,9 +51,9 @@ No modules. | [notify\_warn\_override](#input\_notify\_warn\_override) | List of notifications for alerts in warning threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | | [process\_check\_enabled](#input\_process\_check\_enabled) | Flag to enable Process Check monitor | `string` | `"true"` | no | | [process\_check\_name](#input\_process\_check\_name) | Name of Process for Process Check Monitor | `string` | `""` | no | -| [process\_check\_threshold\_critical](#input\_process\_check\_threshold\_critical) | Proccess Check critical threshold | `number` | `5` | no | -| [process\_check\_threshold\_ok](#input\_process\_check\_threshold\_ok) | Proccess Check ok threshold | `number` | `1` | no | -| [process\_check\_threshold\_warning](#input\_process\_check\_threshold\_warning) | Proccess Check warning threshold | `number` | `2` | no | +| [process\_check\_threshold\_critical](#input\_process\_check\_threshold\_critical) | Process Check critical threshold | `number` | `5` | no | +| [process\_check\_threshold\_ok](#input\_process\_check\_threshold\_ok) | Process Check ok threshold | `number` | `1` | no | +| [process\_check\_threshold\_warning](#input\_process\_check\_threshold\_warning) | Process Check warning threshold | `number` | `2` | no | | [process\_check\_timeframe](#input\_process\_check\_timeframe) | Monitor timeframe for Process Check [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | `string` | `"last_5m"` | no | | [renotify\_interval](#input\_renotify\_interval) | Interval in minutes to re-send notifications about an alert | `number` | `0` | no | | [runbook\_link](#input\_runbook\_link) | Runbook link to include in message | `string` | `null` | no |