From 3b57f5ed40974066e7b8fcd79254dc55e6a88bb7 Mon Sep 17 00:00:00 2001 From: Himanshu Sharma Date: Thu, 14 Mar 2024 15:55:13 +0530 Subject: [PATCH 1/3] Updated monitors tf & json with new TI operator --- .../app-modules/alb/app.tf | 2 +- .../app-modules/elb/app.tf | 2 +- .../app-modules/sns/app.tf | 2 +- .../app-modules/sqs/app.tf | 2 +- aws-observability/json/Alerts-App.json | 1282 +++++++++-------- 5 files changed, 714 insertions(+), 576 deletions(-) diff --git a/aws-observability-terraform/app-modules/alb/app.tf b/aws-observability-terraform/app-modules/alb/app.tf index 2ed1f7f0..ccdc3a43 100644 --- a/aws-observability-terraform/app-modules/alb/app.tf +++ b/aws-observability-terraform/app-modules/alb/app.tf @@ -27,7 +27,7 @@ module "alb_module" { monitor_is_disabled = var.monitors_disabled monitor_evaluation_delay = "0m" queries = { - A = "account=* region=* namespace=aws/applicationelb\n| parse \"* * * * * * * * * * * * \\\"*\\\" \\\"*\\\" * * * \\\"*\\\"\" as Type, DateTime, loadbalancer, Client, Target, RequestProcessingTime, TargetProcessingTime, ResponseProcessingTime, ElbStatusCode, TargetStatusCode, ReceivedBytes, SentBytes, Request, UserAgent, SslCipher, SslProtocol, TargetGroupArn, TraceId\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancer, account, region, namespace\n| lookup type, actor, raw, threatlevel as MaliciousConfidence from sumo://threat/cs on threat=ClientIp \n| json field=raw \"labels[*].name\" as LabelName \n| replace(LabelName, \"\\\\/\",\"->\") as LabelName\n| replace(LabelName, \"\\\"\",\" \") as LabelName\n| where type=\"ip_address\" and MaliciousConfidence=\"high\"\n| if (isEmpty(actor), \"Unassigned\", actor) as Actor\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancer, account, region, namespace, MaliciousConfidence, Actor, LabelName" + A = "account=* region=* namespace=aws/applicationelb\n| parse \"* * * * * * * * * * * * \\\"*\\\" \\\"*\\\" * * * \\\"*\\\"\" as Type, DateTime, loadbalancer, Client, Target, RequestProcessingTime, TargetProcessingTime, ResponseProcessingTime, ElbStatusCode, TargetStatusCode, ReceivedBytes, SentBytes, Request, UserAgent, SslCipher, SslProtocol, TargetGroupArn, TraceId\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancer, account, region, namespace\n| threatlookup singleIndicator ClientIp\n| where _threatlookup.threat_type=\"ip_address\" and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as MaliciousConfidence\n| where MaliciousConfidence=\"high\"\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancer, account, region, namespace, MaliciousConfidence, Actor" } triggers = [ { diff --git a/aws-observability-terraform/app-modules/elb/app.tf b/aws-observability-terraform/app-modules/elb/app.tf index 7816a6c7..45cd9475 100644 --- a/aws-observability-terraform/app-modules/elb/app.tf +++ b/aws-observability-terraform/app-modules/elb/app.tf @@ -25,7 +25,7 @@ module "classic_elb_module" { monitor_is_disabled = var.monitors_disabled monitor_evaluation_delay = "0m" queries = { - A = "account=* 
region=* namespace=aws/elb\n| parse \"* * * * * * * * * * * \\\"*\\\" \\\"*\\\" * *\" as datetime, loadbalancername, client, backend, request_processing_time, backend_processing_time, response_processing_time, elb_status_code, backend_status_code, received_bytes, sent_bytes, request, user_agent, ssl_cipher, ssl_protocol\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancername, account, region, namespace\n| lookup type, actor, raw, threatlevel as MaliciousConfidence from sumo://threat/cs on threat=ClientIp \n| json field=raw \"labels[*].name\" as LabelName \n| replace(LabelName, \"\\\\/\",\"->\") as LabelName\n| replace(LabelName, \"\\\"\",\" \") as LabelName\n| where type=\"ip_address\" and MaliciousConfidence=\"high\"\n| if (isEmpty(actor), \"Unassigned\", actor) as Actor\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancername, account, region, namespace, MaliciousConfidence, Actor, LabelName" + A = "account=* region=* namespace=aws/elb\n| parse \"* * * * * * * * * * * \\\"*\\\" \\\"*\\\" * *\" as datetime, loadbalancername, client, backend, request_processing_time, backend_processing_time, response_processing_time, elb_status_code, backend_status_code, received_bytes, sent_bytes, request, user_agent, ssl_cipher, ssl_protocol\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancername, account, region, namespace\n| threatlookup singleIndicator ClientIp\n| where _threatlookup.threat_type=\"ip_address\" and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as MaliciousConfidence\n| where MaliciousConfidence=\"high\"\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancername, account, region, namespace, MaliciousConfidence, Actor" } triggers = [ { diff --git a/aws-observability-terraform/app-modules/sns/app.tf b/aws-observability-terraform/app-modules/sns/app.tf index b9703c09..c819b52b 100644 --- a/aws-observability-terraform/app-modules/sns/app.tf +++ b/aws-observability-terraform/app-modules/sns/app.tf @@ -97,7 +97,7 @@ module "sns_module" { monitor_is_disabled = var.monitors_disabled monitor_evaluation_delay = "0m" queries = { - A = "account=* region=* namespace=aws/sns \"\\\"eventsource\\\":\\\"sns.amazonaws.com\\\"\" sourceIPAddress \n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"sourceIPAddress\", \"userAgent\", \"eventType\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"requestID\", \"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, src_ip, user_agent, event_type, recipient_account_id, requestParameters, responseElements, request_id, error_code, error_message nodrop \n| where event_source = \"sns.amazonaws.com\" \n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, user_type, arn, username nodrop \n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountid, user nodrop \n| json field=requestParameters \"topicArn\", \"name\", \"resourceArn\", \"subscriptionArn\" as req_topic_arn, 
req_topic_name, resource_arn, subscription_arn nodrop \n| json field=responseElements \"topicArn\" as res_topic_arn nodrop \n| if (isBlank(req_topic_arn), res_topic_arn, req_topic_arn) as topic_arn \n| if (isBlank(topic_arn), resource_arn, topic_arn) as topic_arn \n| parse field=topic_arn \"arn:aws:sns:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp nodrop \n| parse field=subscription_arn \"arn:aws:sns:*:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp, arn_value_temp nodrop \n| if (isBlank(req_topic_name), topic_arn_name_temp, req_topic_name) as topicname \n| if (isBlank(accountid), recipient_account_id, accountid) as accountid \n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status \n| if (isEmpty(username), user_type, username) as user_type \n| count as ip_count by src_ip, event_name, region, accountid,user_type \n| lookup type, actor, raw, threatlevel as malicious_confidence from sumo://threat/cs on threat=src_ip \n| where type=\"ip_address\" and malicious_confidence = \"high\" \n| json field=raw \"labels[*].name\" as label_name \n| replace(label_name, \"\\\\/\",\"->\") as label_name \n| replace(label_name, \"\\\"\",\" \") as label_name \n| if (isEmpty(actor), \"Unassigned\", actor) as actor \n| sum(ip_count) as threat_count by src_ip, event_name, region, accountid, malicious_confidence, actor, label_name" + A = "account=* region=* namespace=aws/sns \"\\\"eventsource\\\":\\\"sns.amazonaws.com\\\"\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"sourceIPAddress\", \"userAgent\", \"eventType\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"requestID\", \"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, src_ip, user_agent, event_type, recipient_account_id, requestParameters, responseElements, request_id, error_code, error_message nodrop\n| where event_source = \"sns.amazonaws.com\" \n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, user_type, arn, username nodrop\n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountid, user nodrop\n| json field=requestParameters \"topicArn\", \"name\", \"resourceArn\", \"subscriptionArn\" as req_topic_arn, req_topic_name, resource_arn, subscription_arn nodrop \n| json field=responseElements \"topicArn\" as res_topic_arn nodrop\n| if (isBlank(req_topic_arn), res_topic_arn, req_topic_arn) as topic_arn\n| if (isBlank(topic_arn), resource_arn, topic_arn) as topic_arn\n| parse field=topic_arn \"arn:aws:sns:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp nodrop\n| parse field=subscription_arn \"arn:aws:sns:*:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp, arn_value_temp nodrop\n| if (isBlank(req_topic_name), topic_arn_name_temp, req_topic_name) as topicname\n| if (isBlank(accountid), recipient_account_id, accountid) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status\n| if (isEmpty(username), user_type, username) as user_type\n| count as ip_count by src_ip, event_name, region, accountid,user_type\n| threatlookup singleIndicator src_ip\n| where _threatlookup.threat_type=\"ip_address\" and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, 
\"unverified\", \"Unknown\")))) as malicious_confidence\n| where malicious_confidence=\"high\"\n| sum(ip_count) as threat_count by src_ip, event_name, region, accountid, malicious_confidence, actor" } triggers = [ diff --git a/aws-observability-terraform/app-modules/sqs/app.tf b/aws-observability-terraform/app-modules/sqs/app.tf index fe0072d4..ae9b7caf 100644 --- a/aws-observability-terraform/app-modules/sqs/app.tf +++ b/aws-observability-terraform/app-modules/sqs/app.tf @@ -97,7 +97,7 @@ module "sqs_module" { monitor_is_disabled = var.monitors_disabled monitor_evaluation_delay = "0m" queries = { - A = "account=* region=* namespace=\"aws/sqs\" eventname eventsource \"sqs.amazonaws.com\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"sourceIPAddress\",\"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, recipient_account_id, requestParameters, responseElements, src_ip, error_code, error_message nodrop\n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, type, arn, username nodrop\n| json field=requestParameters \"queueUrl\" as queueUrlReq nodrop \n| json field=responseElements \"queueUrl\" as queueUrlRes nodrop\n| where event_source=\"sqs.amazonaws.com\" and !(src_ip matches \"*.amazonaws.com\")\n| if(event_name=\"CreateQueue\", queueUrlRes, queueUrlReq) as queueUrl \n| parse regex field=queueUrl \"(?[^\\/]*$)\"\n| if (isBlank(recipient_account_id), accountid, recipient_account_id) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status \n| count as ip_count by src_ip\n| lookup type, actor, raw, threatlevel as malicious_confidence from sumo://threat/cs on threat=src_ip\n| json field=raw \"labels[*].name\" as label_name \n| replace(label_name, \"\\\\/\",\"->\") as label_name\n| replace(label_name, \"\\\"\",\" \") as label_name\n| if (isEmpty(actor), \"Unassigned\", actor) as actor\n| where type=\"ip_address\" and malicious_confidence = \"high\"\n| sort by ip_count, src_ip\n| fields src_ip, malicious_confidence, actor, label_name" + A = "account=* region=* namespace=\"aws/sqs\" eventname eventsource \"sqs.amazonaws.com\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"sourceIPAddress\",\"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, recipient_account_id, requestParameters, responseElements, src_ip, error_code, error_message nodrop\n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, type, arn, username nodrop\n| json field=requestParameters \"queueUrl\" as queueUrlReq nodrop \n| json field=responseElements \"queueUrl\" as queueUrlRes nodrop\n| where event_source=\"sqs.amazonaws.com\" and !(src_ip matches \"*.amazonaws.com\")\n| if(event_name=\"CreateQueue\", queueUrlRes, queueUrlReq) as queueUrl \n| parse regex field=queueUrl \"(?[^\\/]*$)\"\n| if (isBlank(recipient_account_id), accountid, recipient_account_id) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status \n| count as ip_count by src_ip\n| threatlookup singleIndicator src_ip\n| where _threatlookup.threat_type=\"ip_address\" and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 
50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as malicious_confidence\n| where malicious_confidence=\"high\"\n| sort by ip_count, src_ip\n| fields src_ip, malicious_confidence, actor" } triggers = [ { diff --git a/aws-observability/json/Alerts-App.json b/aws-observability/json/Alerts-App.json index e60038f0..33be1e9a 100644 --- a/aws-observability/json/Alerts-App.json +++ b/aws-observability/json/Alerts-App.json @@ -4,26 +4,69 @@ "type": "MonitorsLibraryFolderExport", "children": [ { - "name": "AWS API Gateway - High 5XX Errors", - "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", + "name": "Amazon Elasticache - Multiple Failed Operations", + "description": "This alert fires when we detect multiple failed operations within a 15 minute interval for an ElastiCache service.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/apigateway metric=5xxError Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" - }, + "query": "account=* region=* namespace=aws/elasticache \"\\\"eventSource\\\":\\\"elasticache.amazonaws.com\\\"\" errorCode errorMessage\n| json \"eventSource\", \"errorCode\", \"errorMessage\", \"userIdentity\", \"requestParameters\", \"responseElements\" as event_source, error_code, error_message, user_identity, requestParameters, responseElements nodrop\n| json field=requestParameters \"cacheClusterId\" as req_cacheClusterId nodrop\n| json field=responseElements \"cacheClusterId\" as res_cacheClusterId nodrop\n| json field=user_identity \"arn\", \"userName\" nodrop \n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountId, user nodrop\n| if (isEmpty(userName), user, userName) as user\n| if (isEmpty(req_cacheClusterId), res_cacheClusterId, req_cacheClusterId) as cacheclusterid\n| where event_source matches \"elasticache.amazonaws.com\" and !isEmpty(error_code) and !isEmpty(error_message) and !isEmpty(user)\n| count as event_count by _messageTime, account, region, event_source, error_code, error_message, user, cacheclusterid\n| formatDate(_messageTime, \"MM/dd/yyyy HH:mm:ss:SSS Z\") as message_date\n| fields message_date, account, region, event_source, error_code, error_message, user, cacheclusterid\n| fields -_messageTime" + } + ], + "triggers": [ { - "rowId": "B", - "query": "Namespace=aws/apigateway metric=count Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" + "detectionMethod": "StaticCondition", + "triggerType": "Critical", + "resolutionWindow": null, + "timeRange": "-15m", + "threshold": 10, + "thresholdType": "GreaterThanOrEqual", + "field": null, + "occurrenceType": "ResultCount", + "triggerSource": "AllResults", + "minDataPoints": null }, { - "rowId": "C", - "query": "#A * 100 / #B along account, region, namespace" + "detectionMethod": "StaticCondition", + "triggerType": "ResolvedCritical", + "resolutionWindow": null, + "timeRange": "-15m", + "threshold": 10, + "thresholdType": "LessThan", + "field": null, + "occurrenceType": "ResultCount", + "triggerSource": "AllResults", + "minDataPoints": null + } + ], + "timeZone": null, + "notifications": [], + "isDisabled": 
true, + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] + }, + { + "name": "Amazon RDS - High Read Latency", + "description": "This alert fires when the average read latency of a database within a 5 minutes time inerval is high (>=5 seconds). High read latency will affect the performance of your application.", + "type": "MonitorsLibraryMonitorExport", + "monitorType": "Metrics", + "evaluationDelay": "4m", + "alertName": null, + "runAs": null, + "notificationGroupFields": [], + "queries": [ + { + "rowId": "A", + "query": "Namespace=aws/rds metric=ReadLatency statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ @@ -52,66 +95,68 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS EC2 - High System CPU Utilization", - "description": "This alert fires when the average system CPU utilization within a 5 minute interval for an EC2 instance is high (>=85%).", + "name": "AWS SNS - Failed Notifications", + "description": "This alert fires where there are many failed notifications (>2) within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/ec2 metric=CPU_Sys account=* region=* instanceid=* | avg by account, region, namespace, instanceid" + "query": "account=* region=* namespace=aws/sns TopicName=* metric=NumberOfNotificationsFailed Statistic=Sum | sum by account, region, TopicName" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, - "thresholdType": "GreaterThanOrEqual", - "field": null, + "threshold": 2, + "thresholdType": "GreaterThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, - "thresholdType": "LessThan", - "field": null, + "threshold": 2, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS SNS - Failed Events", - "description": "This alert fires when an SNS app has high number of failed events (>5) within last 5 minutes", + "name": "AWS EC2 - High Total CPU Utilization", + "description": "This alert fires when the average total CPU utilization within a 5 minute interval for an EC2 instance is high (>=85%).", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", + "monitorType": "Metrics", "evaluationDelay": "0m", "alertName": null, "runAs": null, @@ -119,39 +164,48 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/sns 
\"\\\"eventsource\\\":\\\"sns.amazonaws.com\\\"\" errorCode\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"sourceIPAddress\", \"userAgent\", \"eventType\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"requestID\", \"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, src_ip, user_agent, event_type, recipient_account_id, requestParameters, responseElements, request_id, error_code, error_message nodrop\n| where event_source = \"sns.amazonaws.com\" and !isblank(error_code)\n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, type, arn, username nodrop\n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountId, user nodrop\n| json field=requestParameters \"topicArn\", \"name\", \"resourceArn\", \"subscriptionArn\" as req_topic_arn, req_topic_name, resource_arn, subscription_arn nodrop \n| json field=responseElements \"topicArn\" as res_topic_arn nodrop\n| if (isBlank(req_topic_arn), res_topic_arn, req_topic_arn) as topic_arn\n| if (isBlank(topic_arn), resource_arn, topic_arn) as topic_arn\n| parse field=topic_arn \"arn:aws:sns:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp nodrop\n| parse field=subscription_arn \"arn:aws:sns:*:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp, arn_value_temp nodrop\n| if (isBlank(req_topic_name), topic_arn_name_temp, req_topic_name) as topicname\n| if (isBlank(accountid), recipient_account_id, accountid) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status\n| if (isEmpty(username), user, username) as user\n| count as event_count by event_name, error_code, error_message, region, src_ip, accountid, user, type, request_id, topicname, topic_arn, user_agent\n" + "query": "Namespace=aws/ec2 metric=CPU_Total account=* region=* instanceid=* | avg by account, region, namespace, instanceid" } ], "triggers": [ { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "GreaterThan", - "field": null + "threshold": 85, + "thresholdType": "GreaterThanOrEqual", + "field": null, + "occurrenceType": "Always", + "triggerSource": "AnyTimeSeries", + "minDataPoints": 2 }, { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "LessThanOrEqual", - "field": null + "threshold": 85, + "thresholdType": "LessThan", + "field": null, + "occurrenceType": "Always", + "triggerSource": "AnyTimeSeries", + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": true, + "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - High Account Provisioned Write Capacity", - "description": "This alert fires when we detect that the average write capacity provisioned for an account for a time interval of 5 minutes is greater than or equal to 80%. 
High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", + "name": "AWS Application Load Balancer - High Latency", + "description": "This alert fires when we detect that the average latency for a given Application load balancer within a time interval of 5 minutes is greater than or equal to three seconds.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -161,7 +215,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/dynamodb metric=AccountProvisionedWriteCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" + "query": "Namespace=aws/applicationelb metric=TargetResponseTime Statistic=Average account=* region=* loadbalancer=* | eval(_value*1000) | sum by account, region, namespace, loadbalancer" } ], "triggers": [ @@ -170,7 +224,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 3000, "thresholdType": "GreaterThanOrEqual", "field": null, "occurrenceType": "Always", @@ -182,7 +236,7 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 3000, "thresholdType": "LessThan", "field": null, "occurrenceType": "Always", @@ -190,12 +244,15 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { "name": "AWS SNS - Access from Highly Malicious Sources", @@ -209,7 +266,7 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/sns \"\\\"eventsource\\\":\\\"sns.amazonaws.com\\\"\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"sourceIPAddress\", \"userAgent\", \"eventType\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"requestID\", \"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, src_ip, user_agent, event_type, recipient_account_id, requestParameters, responseElements, request_id, error_code, error_message nodrop\n| where event_source = \"sns.amazonaws.com\" \n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, user_type, arn, username nodrop\n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountid, user nodrop\n| json field=requestParameters \"topicArn\", \"name\", \"resourceArn\", \"subscriptionArn\" as req_topic_arn, req_topic_name, resource_arn, subscription_arn nodrop \n| json field=responseElements \"topicArn\" as res_topic_arn nodrop\n| if (isBlank(req_topic_arn), res_topic_arn, req_topic_arn) as topic_arn\n| if (isBlank(topic_arn), resource_arn, topic_arn) as topic_arn\n| parse field=topic_arn \"arn:aws:sns:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp nodrop\n| parse field=subscription_arn \"arn:aws:sns:*:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp, arn_value_temp nodrop\n| if (isBlank(req_topic_name), topic_arn_name_temp, req_topic_name) as topicname\n| if (isBlank(accountid), recipient_account_id, accountid) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status\n| if (isEmpty(username), user_type, username) as user_type\n| count as ip_count by src_ip, event_name, region, accountid,user_type\n| lookup type, actor, raw, 
threatlevel as malicious_confidence from sumo://threat/cs on threat=src_ip\n| where type=\"ip_address\" and malicious_confidence = \"high\"\n| json field=raw \"labels[*].name\" as label_name \n| replace(label_name, \"\\\\/\",\"->\") as label_name\n| replace(label_name, \"\\\"\",\" \") as label_name\n| if (isEmpty(actor), \"Unassigned\", actor) as actor\n| sum(ip_count) as threat_count by src_ip, event_name, region, accountid, malicious_confidence, actor, label_name\n" + "query": "account=* region=* namespace=aws/sns \"\\\"eventsource\\\":\\\"sns.amazonaws.com\\\"\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"sourceIPAddress\", \"userAgent\", \"eventType\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"requestID\", \"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, src_ip, user_agent, event_type, recipient_account_id, requestParameters, responseElements, request_id, error_code, error_message nodrop\n| where event_source = \"sns.amazonaws.com\" \n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, user_type, arn, username nodrop\n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountid, user nodrop\n| json field=requestParameters \"topicArn\", \"name\", \"resourceArn\", \"subscriptionArn\" as req_topic_arn, req_topic_name, resource_arn, subscription_arn nodrop \n| json field=responseElements \"topicArn\" as res_topic_arn nodrop\n| if (isBlank(req_topic_arn), res_topic_arn, req_topic_arn) as topic_arn\n| if (isBlank(topic_arn), resource_arn, topic_arn) as topic_arn\n| parse field=topic_arn \"arn:aws:sns:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp nodrop\n| parse field=subscription_arn \"arn:aws:sns:*:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp, arn_value_temp nodrop\n| if (isBlank(req_topic_name), topic_arn_name_temp, req_topic_name) as topicname\n| if (isBlank(accountid), recipient_account_id, accountid) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status\n| if (isEmpty(username), user_type, username) as user_type\n| count as ip_count by src_ip, event_name, region, accountid,user_type\n| threatlookup singleIndicator src_ip\n| where _threatlookup.threat_type=\"ip_address\" and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as malicious_confidence\n| where malicious_confidence=\"high\"\n| sum(ip_count) as threat_count by src_ip, event_name, region, accountid, malicious_confidence, actor" } ], "triggers": [ @@ -232,16 +289,19 @@ "field": null } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - High Integration Latency", - "description": "This alert fires when we detect that the average integration latency for a given API Gateway is greater than or equal to one second for 5 minutes.", + "name": "AWS DynamoDB - High Max Provisioned Table Read Capacity", + "description": "This alert fires when we detect that the average percentage of read provisioned capacity used by 
the highest read provisioned table of an account for a time interval of 5 minutes is great than or equal to 80%. High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -251,7 +311,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/apigateway metric=IntegrationLatency statistic=Average account=* region=* apiname=* | avg by apiname, namespace, region, account" + "query": "Namespace=aws/dynamodb metric=MaxProvisionedTableReadCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" } ], "triggers": [ @@ -260,7 +320,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1000, + "threshold": 80, "thresholdType": "GreaterThanOrEqual", "field": null, "occurrenceType": "Always", @@ -272,7 +332,7 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1000, + "threshold": 80, "thresholdType": "LessThan", "field": null, "occurrenceType": "Always", @@ -280,16 +340,19 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Lambda - High Percentage of Failed Requests", - "description": "This alert fires when we detect a large number of failed Lambda requests (>5%) within an interval of 5 minutes.", + "name": "AWS API Gateway - High 4XX Errors", + "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -299,15 +362,15 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/lambda metric=Errors Statistic=Sum account=* region=* functionname=* | sum by functionname, account, region, namespace" + "query": "Namespace=aws/apigateway metric=4xxError Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" }, { "rowId": "B", - "query": "Namespace=aws/lambda metric=Invocations Statistic=Sum account=* region=* functionname=* | sum by functionname, account, region, namespace" + "query": "Namespace=aws/apigateway metric=count Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" }, { "rowId": "C", - "query": "#A * 100 / #B along functionname, account, region, namespace" + "query": "#A * 100 / #B along apiname, account, region, namespace" } ], "triggers": [ @@ -336,16 +399,19 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - Low Aurora Buffer Cache Hit Ratio", - "description": "This alert fires when the average RDS Aurora buffer cache hit ratio within a 5 minute interval is low (<= 50%). 
This indicates that a lower percentage of requests were are served by the buffer cache, which could further indicate a degradation in application performance.", + "name": "AWS Classic Load Balancer - High 4XX Errors", + "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -355,45 +421,52 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=BufferCacheHitRatio statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "Namespace=aws/elb metric=HTTPCode_ELB_4XX Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/elb metric=RequestCount Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" + }, + { + "rowId": "C", + "query": "#A * 100 / #B along loadbalancername, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "LessThanOrEqual", - "field": null, + "threshold": 5, + "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "GreaterThan", - "field": null, + "threshold": 5, + "thresholdType": "LessThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Lambda - Low Provisioned Concurrency Utilization", - "description": "This alert fires when the average provisioned concurrency utilization for 5 minutes is low (<= 50%). This indicates low provisioned concurrency utilization efficiency.", + "name": "Amazon RDS - High Write Latency", + "description": "This alert fires when the average write latency of a database within a 5 minute interval is high (>=5 seconds) . 
High write latencies will affect the performance of your application.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -403,7 +476,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/lambda metric=ProvisionedConcurrencyUtilization statistic=Average account=* region=* functionname=* | avg by functionname, namespace, region, account" + "query": "Namespace=aws/rds metric=WriteLatency statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ @@ -412,8 +485,8 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "LessThanOrEqual", + "threshold": 5, + "thresholdType": "GreaterThanOrEqual", "field": null, "occurrenceType": "Always", "triggerSource": "AnyTimeSeries", @@ -424,34 +497,37 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "GreaterThan", + "threshold": 5, + "thresholdType": "LessThan", "field": null, "occurrenceType": "Always", "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - Low Redis Cache Hit Rate", - "description": "This alert fires when the average cache hit rate for Redis within a 5 minute interval is low (<= 80%). This indicates low efficiency of the Redis instance. If cache ratio is lower than 80%, that indicates a significant amount of keys are either evicted, expired, or don't exist.", + "name": "AWS Application Load Balancer - Access from Highly Malicious Sources", + "description": "This alert fires when an Application load balancer is accessed from highly malicious IP addresses within last 5 minutes", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=CacheHitRate statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" + "query": "account=* region=* namespace=aws/applicationelb\n| parse \"* * * * * * * * * * * * \\\"*\\\" \\\"*\\\" * * * \\\"*\\\"\" as Type, DateTime, loadbalancer, Client, Target, RequestProcessingTime, TargetProcessingTime, ResponseProcessingTime, ElbStatusCode, TargetStatusCode, ReceivedBytes, SentBytes, Request, UserAgent, SslCipher, SslProtocol, TargetGroupArn, TraceId\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancer, account, region, namespace\n| threatlookup singleIndicator ClientIp\n| where _threatlookup.threat_type=\"ip_address\" and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as MaliciousConfidence\n| where MaliciousConfidence=\"high\"\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancer, account, region, namespace, 
MaliciousConfidence, Actor" } ], "triggers": [ @@ -460,36 +536,39 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, - "thresholdType": "LessThanOrEqual", + "threshold": 0, + "thresholdType": "GreaterThan", "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "occurrenceType": "ResultCount", + "triggerSource": "AllResults", + "minDataPoints": null }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, - "thresholdType": "GreaterThan", + "threshold": 0, + "thresholdType": "LessThanOrEqual", "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "occurrenceType": "ResultCount", + "triggerSource": "AllResults", + "minDataPoints": null } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS SQS - Message processing not fast enough", - "description": "This alert fires when we detect message processing is not fast enough. That is, the average approximate age of the oldest non-deleted message in the queue is more than 5 seconds for an interval of 5 minutes.", + "name": "Amazon Elasticache - High CPU Utilization", + "description": "This alert fires when the average CPU utilization within a 5 minute interval for a host is high (>=90%). The CPUUtilization metric includes total CPU utilization across application, operating system and management processes. We highly recommend monitoring CPU utilization for hosts with two vCPUs or less.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -499,51 +578,58 @@ "queries": [ { "rowId": "A", - "query": "metric=ApproximateAgeOfOldestMessage Statistic=avg region=* account=* queuename=* namespace=aws/sqs | avg by account,region,namespace,queuename " + "query": "Namespace=aws/elasticache metric=CPUUtilization statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by CacheClusterId, CacheNodeId, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "GreaterThan", + "threshold": 90, + "thresholdType": "GreaterThanOrEqual", + "field": null, "occurrenceType": "Always", - "minDataPoints": 3 + "triggerSource": "AnyTimeSeries", + "minDataPoints": 2 }, { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "LessThanOrEqual", + "threshold": 90, + "thresholdType": "LessThan", + "field": null, "occurrenceType": "Always", - "minDataPoints": 3 + "triggerSource": "AnyTimeSeries", + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": true, + "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS EC2 - High Memory Utilization", - "description": "This alert fires when the average memory utilization within a 5 minute interval for an EC2 instance is high 
(>=85%).", + "name": "AWS API Gateway - High Integration Latency", + "description": "This alert fires when we detect that the average integration latency for a given API Gateway is greater than or equal to one second for 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/ec2 metric=Mem_UsedPercent account=* region=* instanceid=* | avg by account, region, namespace, instanceid" + "query": "Namespace=aws/apigateway metric=IntegrationLatency statistic=Average account=* region=* apiname=* | avg by apiname, namespace, region, account" } ], "triggers": [ @@ -552,7 +638,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 1000, "thresholdType": "GreaterThanOrEqual", "field": null, "occurrenceType": "Always", @@ -564,7 +650,7 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 1000, "thresholdType": "LessThan", "field": null, "occurrenceType": "Always", @@ -572,60 +658,19 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS SNS - Failed Notifications", - "description": "This alert fires where there are many failed notifications (>2) within an interval of 5 minutes.", - "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", - "alertName": null, - "runAs": null, - "notificationGroupFields": [], - "queries": [ - { - "rowId": "A", - "query": "account=* region=* namespace=aws/sns TopicName=* metric=NumberOfNotificationsFailed Statistic=Sum | sum by account, region, TopicName" - } - ], - "triggers": [ - { - "detectionMethod": "MetricsStaticCondition", - "triggerType": "Critical", - "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 2, - "thresholdType": "GreaterThan", - "occurrenceType": "Always", - "minDataPoints": 2 - }, - { - "detectionMethod": "MetricsStaticCondition", - "triggerType": "ResolvedCritical", - "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 2, - "thresholdType": "LessThanOrEqual", - "occurrenceType": "Always", - "minDataPoints": 2 - } - ], - "notifications": [], - "isDisabled": true, - "groupNotifications": true, - "playbook": "", - "sloId": null, - "monitorTemplateId": null - }, - { - "name": "Amazon RDS - High Read Latency", - "description": "This alert fires when the average read latency of a database within a 5 minutes time inerval is high (>=5 seconds). High read latency will affect the performance of your application.", + "name": "Amazon Elasticache - High Redis Database Memory Usage", + "description": "This alert fires when the average database memory usage within a 5 minute interval for the Redis engine is high (>=95%). 
When the value reaches 100%, eviction may happen or write operations may fail based on ElastiCache policies thereby impacting application performance.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -635,7 +680,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=ReadLatency statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "Namespace=aws/elasticache metric=DatabaseMemoryUsagePercentage statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" } ], "triggers": [ @@ -644,7 +689,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 95, "thresholdType": "GreaterThanOrEqual", "field": null, "occurrenceType": "Always", @@ -656,7 +701,7 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 95, "thresholdType": "LessThan", "field": null, "occurrenceType": "Always", @@ -664,60 +709,70 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS SQS - Messages not processed", - "description": "This alert fires when we detect messages that have been received by a consumer, but have not been processed (deleted/failed). That is, the average number of messages that are in flight are >=20 for an interval of 5 minutes.", + "name": "AWS DynamoDB - Multiple Tables deleted", + "description": "This alert fires when five or more tables are deleted within 15 minutes.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "metric=ApproximateNumberOfMessagesNotVisible Statistic=avg region = * account=* queuename=* namespace=aws/sqs | avg by account, region, namespace, queuename " + "query": "account=* region=* namespace=aws/dynamodb eventSource \"dynamodb.amazonaws.com\"\n| json \"eventSource\", \"eventName\", \"requestParameters.tableName\", \"sourceIPAddress\", \"userIdentity.userName\", \"userIdentity.sessionContext.sessionIssuer.userName\" as event_source, event_name, tablename, SourceIp, UserName, ContextUserName nodrop\n| where event_source = \"dynamodb.amazonaws.com\" and event_name = \"DeleteTable\"\n| if (isEmpty(UserName), ContextUserName, UserName) as user\n| count by _messageTime, account, region, namespace, event_name, user, tablename\n| formatDate(_messageTime, \"MM/dd/yyyy HH:mm:ss:SSS Z\") as message_date\n| fields message_date, account, region, namespace, event_name, user, tablename\n| fields -_messageTime" } ], "triggers": [ { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 20, + "timeRange": "-15m", + "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "occurrenceType": "Always", - "minDataPoints": 3 + "field": null, + "occurrenceType": "ResultCount", + "triggerSource": "AllResults", + "minDataPoints": null }, { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", 
"resolutionWindow": null, - "timeRange": "-5m", - "threshold": 20, + "timeRange": "-15m", + "threshold": 5, "thresholdType": "LessThan", - "occurrenceType": "Always", - "minDataPoints": 3 + "field": null, + "occurrenceType": "ResultCount", + "triggerSource": "AllResults", + "minDataPoints": null } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - High 4XX Errors", - "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", + "name": "AWS DynamoDB - High Max Provisioned Table Write Capacity", + "description": "This alert fires when we detect that the average percentage of write provisioned capacity used by the highest write provisioned table of an account for a time interval of 5 minutes is great than or equal to 80%. High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -727,15 +782,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/apigateway metric=4xxError Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/apigateway metric=count Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" - }, - { - "rowId": "C", - "query": "#A * 100 / #B along apiname, account, region, namespace" + "query": "Namespace=aws/dynamodb metric=MaxProvisionedTableWriteCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" } ], "triggers": [ @@ -744,7 +791,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 80, "thresholdType": "GreaterThanOrEqual", "field": null, "occurrenceType": "Always", @@ -756,7 +803,7 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 80, "thresholdType": "LessThan", "field": null, "occurrenceType": "Always", @@ -764,16 +811,19 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Application Load Balancer - High 4XX Errors", - "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", + "name": "Amazon RDS - Low Aurora Buffer Cache Hit Ratio", + "description": "This alert fires when the average RDS Aurora buffer cache hit ratio within a 5 minute interval is low (<= 50%). 
This indicates that a lower percentage of requests were are served by the buffer cache, which could further indicate a degradation in application performance.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -783,15 +833,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/applicationelb metric=HTTPCode_ELB_4XX_Count Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/applicationelb metric=RequestCount Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" - }, - { - "rowId": "C", - "query": "#A * 100 / #B along loadbalancer, account, region, namespace" + "query": "Namespace=aws/rds metric=BufferCacheHitRatio statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ @@ -800,8 +842,8 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "GreaterThanOrEqual", + "threshold": 50, + "thresholdType": "LessThanOrEqual", "field": null, "occurrenceType": "Always", "triggerSource": "AnyTimeSeries", @@ -812,24 +854,27 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "LessThan", + "threshold": 50, + "thresholdType": "GreaterThan", "field": null, "occurrenceType": "Always", "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Network Load Balancer - High TLS Negotiation Errors", - "description": "This alert fires when we detect that there are too many TLS Negotiation Errors (>=10%) within an interval of 5 minutes for a given network load balancer", + "name": "Amazon Elasticache - Low Redis Cache Hit Rate", + "description": "This alert fires when the average cache hit rate for Redis within a 5 minute interval is low (<= 80%). This indicates low efficiency of the Redis instance. 
If cache ratio is lower than 80%, that indicates a significant amount of keys are either evicted, expired, or don't exist.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -839,15 +884,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/NetworkELB metric=ClientTLSNegotiationErrorCount Statistic=sum account=* region=* LoadBalancer=* | sum by LoadBalancer, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/NetworkELB metric=TargetTLSNegotiationErrorCount Statistic=sum account=* region=* LoadBalancer=* | sum by LoadBalancer, account, region, namespace" - }, - { - "rowId": "C", - "query": "(#A + #B) along LoadBalancer, account, region, namespace" + "query": "Namespace=aws/elasticache metric=CacheHitRate statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" } ], "triggers": [ @@ -856,8 +893,8 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, - "thresholdType": "GreaterThanOrEqual", + "threshold": 80, + "thresholdType": "LessThanOrEqual", "field": null, "occurrenceType": "Always", "triggerSource": "AnyTimeSeries", @@ -868,24 +905,27 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, - "thresholdType": "LessThan", + "threshold": 80, + "thresholdType": "GreaterThan", "field": null, "occurrenceType": "Always", "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS SQS - Queue has stopped receiving messages", - "description": "This alert fires when we detect that the queue has stopped receiving messages. That is, the average number of messages received in the queue <1 for an interval of 30 minutes.", + "name": "Amazon Elasticache - High Redis Memory Fragmentation Ratio", + "description": "This alert fires when the average Redis memory fragmentation ratio for within a 5 minute interval is high (>=1.5). 
Value equal to or greater than 1.5 Indicate significant memory fragmentation.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -895,41 +935,48 @@ "queries": [ { "rowId": "A", - "query": "metric=NumberOfMessagesReceived Statistic=avg region=* account=* queuename=* namespace=aws/sqs | avg by account, region, namespace, queuename " + "query": "Namespace=aws/elasticache metric=MemoryFragmentationRatio statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" } ], "triggers": [ { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-30m", - "threshold": 1, - "thresholdType": "LessThan", + "timeRange": "-5m", + "threshold": 1.5, + "thresholdType": "GreaterThanOrEqual", + "field": null, "occurrenceType": "Always", - "minDataPoints": 3 + "triggerSource": "AnyTimeSeries", + "minDataPoints": 2 }, { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-30m", - "threshold": 1, - "thresholdType": "GreaterThanOrEqual", + "timeRange": "-5m", + "threshold": 1.5, + "thresholdType": "LessThan", + "field": null, "occurrenceType": "Always", - "minDataPoints": 3 + "triggerSource": "AnyTimeSeries", + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": true, + "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS EC2 - High Disk Utilization", - "description": "This alert fires when the average disk utilization within a 5 minute time interval for an EC2 instance is high (>=85%).", + "name": "AWS EC2 - High Memory Utilization", + "description": "This alert fires when the average memory utilization within a 5 minute interval for an EC2 instance is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "0m", @@ -939,7 +986,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/ec2 metric=Disk_UsedPercent account=* region=* instanceid=* | avg by account, region, namespace, instanceid, devname" + "query": "Namespace=aws/ec2 metric=Mem_UsedPercent account=* region=* instanceid=* | avg by account, region, namespace, instanceid" } ], "triggers": [ @@ -968,26 +1015,37 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - Multiple Tables deleted", - "description": "This alert fires when five or more tables are deleted within 15 minutes.", + "name": "AWS Application Load Balancer - High 5XX Errors", + "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/dynamodb eventSource \"dynamodb.amazonaws.com\"\n| json \"eventSource\", \"eventName\", 
\"requestParameters.tableName\", \"sourceIPAddress\", \"userIdentity.userName\", \"userIdentity.sessionContext.sessionIssuer.userName\" as event_source, event_name, tablename, SourceIp, UserName, ContextUserName nodrop\n| where event_source = \"dynamodb.amazonaws.com\" and event_name = \"DeleteTable\"\n| if (isEmpty(UserName), ContextUserName, UserName) as user\n| count by _messageTime, account, region, namespace, event_name, user, tablename\n| formatDate(_messageTime, \"MM/dd/yyyy HH:mm:ss:SSS Z\") as message_date\n| fields message_date, account, region, namespace, event_name, user, tablename\n| fields -_messageTime" + "query": "Namespace=aws/applicationelb metric=HTTPCode_ELB_5XX_Count Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/applicationelb metric=RequestCount Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" + }, + { + "rowId": "C", + "query": "#A * 100 / #B along loadbalancer, account, region, namespace" } ], "triggers": [ @@ -995,137 +1053,136 @@ "detectionMethod": "StaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-15m", + "timeRange": "-5m", "threshold": 5, "thresholdType": "GreaterThanOrEqual", "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "occurrenceType": "Always", + "triggerSource": "AnyTimeSeries", + "minDataPoints": 2 }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-15m", + "timeRange": "-5m", "threshold": 5, "thresholdType": "LessThan", "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "occurrenceType": "Always", + "triggerSource": "AnyTimeSeries", + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": true, + "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Classic Load Balancer - High 4XX Errors", - "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", + "name": "AWS EC2 - High System CPU Utilization", + "description": "This alert fires when the average system CPU utilization within a 5 minute interval for an EC2 instance is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "4m", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/elb metric=HTTPCode_ELB_4XX Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/elb metric=RequestCount Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" - }, - { - "rowId": "C", - "query": "#A * 100 / #B along loadbalancername, account, region, namespace" + "query": "Namespace=aws/ec2 metric=CPU_Sys account=* region=* instanceid=* | avg by account, region, namespace, instanceid" } ], "triggers": [ { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + 
"threshold": 85, "thresholdType": "GreaterThanOrEqual", + "field": null, "occurrenceType": "Always", + "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 85, "thresholdType": "LessThan", + "field": null, "occurrenceType": "Always", + "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - High Max Provisioned Table Read Capacity", - "description": "This alert fires when we detect that the average percentage of read provisioned capacity used by the highest read provisioned table of an account for a time interval of 5 minutes is great than or equal to 80%. High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", + "name": "AWS SNS - Failed Events", + "description": "This alert fires when an SNS app has high number of failed events (>5) within last 5 minutes", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/dynamodb metric=MaxProvisionedTableReadCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" + "query": "account=* region=* namespace=aws/sns \"\\\"eventsource\\\":\\\"sns.amazonaws.com\\\"\" errorCode\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"sourceIPAddress\", \"userAgent\", \"eventType\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"requestID\", \"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, src_ip, user_agent, event_type, recipient_account_id, requestParameters, responseElements, request_id, error_code, error_message nodrop\n| where event_source = \"sns.amazonaws.com\" and !isblank(error_code)\n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, type, arn, username nodrop\n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountId, user nodrop\n| json field=requestParameters \"topicArn\", \"name\", \"resourceArn\", \"subscriptionArn\" as req_topic_arn, req_topic_name, resource_arn, subscription_arn nodrop \n| json field=responseElements \"topicArn\" as res_topic_arn nodrop\n| if (isBlank(req_topic_arn), res_topic_arn, req_topic_arn) as topic_arn\n| if (isBlank(topic_arn), resource_arn, topic_arn) as topic_arn\n| parse field=topic_arn \"arn:aws:sns:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp nodrop\n| parse field=subscription_arn \"arn:aws:sns:*:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp, arn_value_temp nodrop\n| if (isBlank(req_topic_name), topic_arn_name_temp, req_topic_name) as topicname\n| if (isBlank(accountid), recipient_account_id, accountid) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status\n| if (isEmpty(username), user, username) as user\n| count as event_count by event_name, error_code, error_message, region, src_ip, accountid, user, 
type, request_id, topicname, topic_arn, user_agent\n" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, - "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "threshold": 5, + "thresholdType": "GreaterThan", + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, - "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "threshold": 5, + "thresholdType": "LessThanOrEqual", + "field": null } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - High Redis Database Memory Usage", - "description": "This alert fires when the average database memory usage within a 5 minute interval for the Redis engine is high (>=95%). When the value reaches 100%, eviction may happen or write operations may fail based on ElastiCache policies thereby impacting application performance.", + "name": "AWS Network Load Balancer - High Unhealthy Hosts", + "description": "This alert fires when we detect that are there are too many unhealthy hosts (>=10%) within an interval of 5 minutes for a given network load balancer", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1135,7 +1192,15 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=DatabaseMemoryUsagePercentage statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" + "query": "Namespace=aws/NetworkELB metric=UnHealthyHostCount Statistic=sum account=* region=* LoadBalancer=* AvailabilityZone=* | sum by LoadBalancer, AvailabilityZone, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/NetworkELB metric=HealthyHostCount Statistic=sum account=* region=* LoadBalancer=* AvailabilityZone=* | sum by LoadBalancer, AvailabilityZone, account, region, namespace" + }, + { + "rowId": "C", + "query": "#A * 100 / (#A + #B) along LoadBalancer, AvailabilityZone, account, region, namespace" } ], "triggers": [ @@ -1144,7 +1209,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 95, + "threshold": 10, "thresholdType": "GreaterThanOrEqual", "field": null, "occurrenceType": "Always", @@ -1156,7 +1221,7 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 95, + "threshold": 10, "thresholdType": "LessThan", "field": null, "occurrenceType": "Always", @@ -1164,16 +1229,19 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon ECS - High CPU Utilization", - "description": "This alert fires when the average CPU utilization within a 5 minute interval for a service within a cluster is high (>=85%).", + "name": "AWS DynamoDB - 
High Account Provisioned Read Capacity", + "description": "This alert fires when we detect that the average read capacity provisioned for an account for a time interval of 5 minutes is greater than or equal to 80%. High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1183,7 +1251,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/ecs metric=CPUUtilization statistic=Average account=* region=* ClusterName=* ServiceName=* | avg by ClusterName, ServiceName, account, region, namespace" + "query": "Namespace=aws/dynamodb metric=AccountProvisionedReadCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" } ], "triggers": [ @@ -1192,7 +1260,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 80, "thresholdType": "GreaterThanOrEqual", "field": null, "occurrenceType": "Always", @@ -1204,7 +1272,7 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 80, "thresholdType": "LessThan", "field": null, "occurrenceType": "Always", @@ -1212,16 +1280,19 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - High Redis Memory Fragmentation Ratio", - "description": "This alert fires when the average Redis memory fragmentation ratio for within a 5 minute interval is high (>=1.5). Value equal to or greater than 1.5 Indicate significant memory fragmentation.", + "name": "Amazon RDS - High CPU Utilization", + "description": "This alert fires when we detect that the average CPU utilization for a database is high (>=85%) for an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1231,7 +1302,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=MemoryFragmentationRatio statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" + "query": "Namespace=aws/rds metric=CPUUtilization statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ @@ -1240,7 +1311,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1.5, + "threshold": 85, "thresholdType": "GreaterThanOrEqual", "field": null, "occurrenceType": "Always", @@ -1252,7 +1323,7 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1.5, + "threshold": 85, "thresholdType": "LessThan", "field": null, "occurrenceType": "Always", @@ -1260,16 +1331,19 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - High CPU Utilization", - "description": "This alert fires when the average CPU utilization within a 5 minute interval for a host is high (>=90%). 
The CPUUtilization metric includes total CPU utilization across application, operating system and management processes. We highly recommend monitoring CPU utilization for hosts with two vCPUs or less.", + "name": "AWS Application Load Balancer - High 4XX Errors", + "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1279,7 +1353,15 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=CPUUtilization statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by CacheClusterId, CacheNodeId, account, region, namespace" + "query": "Namespace=aws/applicationelb metric=HTTPCode_ELB_4XX_Count Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/applicationelb metric=RequestCount Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" + }, + { + "rowId": "C", + "query": "#A * 100 / #B along loadbalancer, account, region, namespace" } ], "triggers": [ @@ -1288,7 +1370,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 90, + "threshold": 5, "thresholdType": "GreaterThanOrEqual", "field": null, "occurrenceType": "Always", @@ -1300,7 +1382,7 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 90, + "threshold": 5, "thresholdType": "LessThan", "field": null, "occurrenceType": "Always", @@ -1308,16 +1390,19 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - High Account Provisioned Read Capacity", - "description": "This alert fires when we detect that the average read capacity provisioned for an account for a time interval of 5 minutes is greater than or equal to 80%. 
High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", + "name": "AWS API Gateway - High 5XX Errors", + "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1327,7 +1412,15 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/dynamodb metric=AccountProvisionedReadCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" + "query": "Namespace=aws/apigateway metric=5xxError Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/apigateway metric=count Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" + }, + { + "rowId": "C", + "query": "#A * 100 / #B along account, region, namespace" } ], "triggers": [ @@ -1336,7 +1429,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 5, "thresholdType": "GreaterThanOrEqual", "field": null, "occurrenceType": "Always", @@ -1348,7 +1441,7 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 5, "thresholdType": "LessThan", "field": null, "occurrenceType": "Always", @@ -1356,58 +1449,66 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS SQS - Access from highly malicious sources", - "description": "This alert fires when an AWS - SQS resource is accessed from highly malicious IP addresses within last 5 minutes", + "name": "AWS Classic Load Balancer - High Latency", + "description": "This alert fires when we detect that the average latency for a given Classic load balancer within a time interval of 5 minutes is greater than or equal to three seconds.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=\"aws/sqs\" eventname eventsource \"sqs.amazonaws.com\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"sourceIPAddress\",\"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, recipient_account_id, requestParameters, responseElements, src_ip, error_code, error_message nodrop\n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, type, arn, username nodrop\n| json field=requestParameters \"queueUrl\" as queueUrlReq nodrop \n| json field=responseElements \"queueUrl\" as queueUrlRes nodrop\n| where event_source=\"sqs.amazonaws.com\" and !(src_ip matches \"*.amazonaws.com\")\n| if(event_name=\"CreateQueue\", queueUrlRes, queueUrlReq) as queueUrl \n| parse regex field=queueUrl \"(?[^\\/]*$)\"\n| if (isBlank(recipient_account_id), accountid, recipient_account_id) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status \n| count as ip_count by src_ip\n| lookup type, 
actor, raw, threatlevel as malicious_confidence from sumo://threat/cs on threat=src_ip\n| json field=raw \"labels[*].name\" as label_name \n| replace(label_name, \"\\\\/\",\"->\") as label_name\n| replace(label_name, \"\\\"\",\" \") as label_name\n| if (isEmpty(actor), \"Unassigned\", actor) as actor\n| where type=\"ip_address\" and malicious_confidence = \"high\"\n| sort by ip_count, src_ip\n| fields src_ip, malicious_confidence, actor, label_name" + "query": "Namespace=aws/elb metric=Latency Statistic=Average account=* region=* loadbalancername=* | eval(_value*1000) | sum by account, region, namespace, loadbalancername" } ], "triggers": [ { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "GreaterThan", - "field": null + "threshold": 3000, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 }, { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "LessThanOrEqual", - "field": null + "threshold": 3000, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": true, + "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - High Latency", - "description": "This alert fires when we detect that the average latency for a given API Gateway is greater than or equal to one second for 5 minutes.", + "name": "AWS DynamoDB - High Account Provisioned Write Capacity", + "description": "This alert fires when we detect that the average write capacity provisioned for an account for a time interval of 5 minutes is greater than or equal to 80%. 
High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1417,7 +1518,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/apigateway metric=Latency statistic=Average account=* region=* apiname=* | avg by apiname, namespace, region, account" + "query": "Namespace=aws/dynamodb metric=AccountProvisionedWriteCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" } ], "triggers": [ @@ -1426,7 +1527,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1000, + "threshold": 80, "thresholdType": "GreaterThanOrEqual", "field": null, "occurrenceType": "Always", @@ -1438,7 +1539,7 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1000, + "threshold": 80, "thresholdType": "LessThan", "field": null, "occurrenceType": "Always", @@ -1446,16 +1547,19 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Classic Load Balancer - High 5XX Errors", - "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", + "name": "AWS Lambda - Low Provisioned Concurrency Utilization", + "description": "This alert fires when the average provisioned concurrency utilization for 5 minutes is low (<= 50%). This indicates low provisioned concurrency utilization efficiency.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1465,101 +1569,58 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elb metric=HTTPCode_ELB_5XX Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/elb metric=RequestCount Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" - }, - { - "rowId": "C", - "query": "#A * 100 / #B along loadbalancername, account, region, namespace" + "query": "Namespace=aws/lambda metric=ProvisionedConcurrencyUtilization statistic=Average account=* region=* functionname=* | avg by functionname, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "GreaterThanOrEqual", + "threshold": 50, + "thresholdType": "LessThanOrEqual", + "field": null, "occurrenceType": "Always", + "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "LessThan", + "threshold": 50, + "thresholdType": "GreaterThan", + "field": null, "occurrenceType": "Always", + "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null - }, - { - "name": "AWS Classic Load Balancer - Access from Highly Malicious 
Sources", - "description": "This alert fires when the Classic load balancer is accessed from highly malicious IP addresses within last 5 minutes.", - "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", - "alertName": null, - "runAs": null, - "notificationGroupFields": [], - "queries": [ - { - "rowId": "A", - "query": "account=* region=* namespace=aws/elb\n| parse \"* * * * * * * * * * * \\\"*\\\" \\\"*\\\" * *\" as datetime, loadbalancername, client, backend, request_processing_time, backend_processing_time, response_processing_time, elb_status_code, backend_status_code, received_bytes, sent_bytes, request, user_agent, ssl_cipher, ssl_protocol\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancername, account, region, namespace\n| lookup type, actor, raw, threatlevel as MaliciousConfidence from sumo://threat/cs on threat=ClientIp \n| json field=raw \"labels[*].name\" as LabelName \n| replace(LabelName, \"\\\\/\",\"->\") as LabelName\n| replace(LabelName, \"\\\"\",\" \") as LabelName\n| where type=\"ip_address\" and MaliciousConfidence=\"high\"\n| if (isEmpty(actor), \"Unassigned\", actor) as Actor\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancername, account, region, namespace, MaliciousConfidence, Actor, LabelName" - } - ], - "triggers": [ - { - "detectionMethod": "LogsStaticCondition", - "triggerType": "Critical", - "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 0, - "thresholdType": "GreaterThan", - "field": null - }, - { - "detectionMethod": "LogsStaticCondition", - "triggerType": "ResolvedCritical", - "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 0, - "thresholdType": "LessThanOrEqual", - "field": null - } - ], - "notifications": [], - "isDisabled": true, - "groupNotifications": true, - "playbook": "", - "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS EC2 - High Total CPU Utilization", - "description": "This alert fires when the average total CPU utilization within a 5 minute interval for an EC2 instance is high (>=85%).", + "name": "Amazon ECS - High Memory Utilization", + "description": "This alert fires when the average memory utilization within a 5 minute interval for a service within a cluster is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/ec2 metric=CPU_Total account=* region=* instanceid=* | avg by account, region, namespace, instanceid" + "query": "Namespace=aws/ecs metric=MemoryUtilization statistic=Average account=* region=* ClusterName=* ServiceName=* | avg by ClusterName, ServiceName, account, region, namespace" } ], "triggers": [ @@ -1588,64 +1649,64 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - High Write Latency", - "description": "This alert fires when the average write latency of a database within a 5 minute interval is high (>=5 seconds) . 
High write latencies will affect the performance of your application.", + "name": "AWS SQS - Access from highly malicious sources", + "description": "This alert fires when an AWS - SQS resource is accessed from highly malicious IP addresses within last 5 minutes", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=WriteLatency statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "account=* region=* namespace=\"aws/sqs\" eventname eventsource \"sqs.amazonaws.com\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"sourceIPAddress\",\"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, recipient_account_id, requestParameters, responseElements, src_ip, error_code, error_message nodrop\n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, type, arn, username nodrop\n| json field=requestParameters \"queueUrl\" as queueUrlReq nodrop \n| json field=responseElements \"queueUrl\" as queueUrlRes nodrop\n| where event_source=\"sqs.amazonaws.com\" and !(src_ip matches \"*.amazonaws.com\")\n| if(event_name=\"CreateQueue\", queueUrlRes, queueUrlReq) as queueUrl \n| parse regex field=queueUrl \"(?[^\\/]*$)\"\n| if (isBlank(recipient_account_id), accountid, recipient_account_id) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status \n| count as ip_count by src_ip\n| threatlookup singleIndicator src_ip\n| where _threatlookup.threat_type=\"ip_address\" and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as malicious_confidence\n| where malicious_confidence=\"high\"\n| sort by ip_count, src_ip\n| fields src_ip, malicious_confidence, actor" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "threshold": 0, + "thresholdType": "GreaterThan", + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "threshold": 0, + "thresholdType": "LessThanOrEqual", + "field": null } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - High Disk Queue Depth", - "description": "This alert fires when the average disk queue depth for a database is high (>=5) for an interval of 5 
minutes. Higher this value, higher will be the number of outstanding I/Os (read/write requests) waiting to access the disk, which will impact the performance of your application.", + "name": "AWS Classic Load Balancer - High 5XX Errors", + "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1655,55 +1716,62 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=DiskQueueDepth statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "Namespace=aws/elb metric=HTTPCode_ELB_5XX Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/elb metric=RequestCount Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" + }, + { + "rowId": "C", + "query": "#A * 100 / #B along loadbalancername, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon ECS - High Memory Utilization", - "description": "This alert fires when the average memory utilization within a 5 minute interval for a service within a cluster is high (>=85%).", + "name": "AWS EC2 - High Disk Utilization", + "description": "This alert fires when the average disk utilization within a 5 minute time interval for an EC2 instance is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "4m", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/ecs metric=MemoryUtilization statistic=Average account=* region=* ClusterName=* ServiceName=* | avg by ClusterName, ServiceName, account, region, namespace" + "query": "Namespace=aws/ec2 metric=Disk_UsedPercent account=* region=* instanceid=* | avg by account, region, namespace, instanceid, devname" } ], "triggers": [ @@ -1732,16 +1800,19 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - High Max Provisioned Table Write Capacity", - "description": "This alert fires when we detect that the average percentage of write provisioned capacity used by the highest write provisioned table of an account for a time interval of 5 minutes is great than or equal to 80%. 
High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", + "name": "AWS API Gateway - High Latency", + "description": "This alert fires when we detect that the average latency for a given API Gateway is greater than or equal to one second for 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1751,7 +1822,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/dynamodb metric=MaxProvisionedTableWriteCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" + "query": "Namespace=aws/apigateway metric=Latency statistic=Average account=* region=* apiname=* | avg by apiname, namespace, region, account" } ], "triggers": [ @@ -1760,7 +1831,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 1000, "thresholdType": "GreaterThanOrEqual", "field": null, "occurrenceType": "Always", @@ -1772,7 +1843,7 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 1000, "thresholdType": "LessThan", "field": null, "occurrenceType": "Always", @@ -1780,16 +1851,19 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Application Load Balancer - High 5XX Errors", - "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", + "name": "AWS Lambda - High Percentage of Failed Requests", + "description": "This alert fires when we detect a large number of failed Lambda requests (>5%) within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1799,15 +1873,15 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/applicationelb metric=HTTPCode_ELB_5XX_Count Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" + "query": "Namespace=aws/lambda metric=Errors Statistic=Sum account=* region=* functionname=* | sum by functionname, account, region, namespace" }, { "rowId": "B", - "query": "Namespace=aws/applicationelb metric=RequestCount Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" + "query": "Namespace=aws/lambda metric=Invocations Statistic=Sum account=* region=* functionname=* | sum by functionname, account, region, namespace" }, { "rowId": "C", - "query": "#A * 100 / #B along loadbalancer, account, region, namespace" + "query": "#A * 100 / #B along functionname, account, region, namespace" } ], "triggers": [ @@ -1836,64 +1910,66 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - Multiple Failed Operations", - "description": "This alert fires when we detect multiple failed operations within a 15 minute interval for an ElastiCache service.", + "name": "AWS SQS - Queue has stopped receiving messages", + "description": "This alert fires when we detect that the queue has stopped receiving messages. 
That is, the average number of messages received in the queue <1 for an interval of 30 minutes.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/elasticache \"\\\"eventSource\\\":\\\"elasticache.amazonaws.com\\\"\" errorCode errorMessage\n| json \"eventSource\", \"errorCode\", \"errorMessage\", \"userIdentity\", \"requestParameters\", \"responseElements\" as event_source, error_code, error_message, user_identity, requestParameters, responseElements nodrop\n| json field=requestParameters \"cacheClusterId\" as req_cacheClusterId nodrop\n| json field=responseElements \"cacheClusterId\" as res_cacheClusterId nodrop\n| json field=user_identity \"arn\", \"userName\" nodrop \n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountId, user nodrop\n| if (isEmpty(userName), user, userName) as user\n| if (isEmpty(req_cacheClusterId), res_cacheClusterId, req_cacheClusterId) as cacheclusterid\n| where event_source matches \"elasticache.amazonaws.com\" and !isEmpty(error_code) and !isEmpty(error_message) and !isEmpty(user)\n| count as event_count by _messageTime, account, region, event_source, error_code, error_message, user, cacheclusterid\n| formatDate(_messageTime, \"MM/dd/yyyy HH:mm:ss:SSS Z\") as message_date\n| fields message_date, account, region, event_source, error_code, error_message, user, cacheclusterid\n| fields -_messageTime" + "query": "metric=NumberOfMessagesReceived Statistic=avg region=* account=* queuename=* namespace=aws/sqs | avg by account, region, namespace, queuename " } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-15m", - "threshold": 10, - "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "timeRange": "-30m", + "threshold": 1, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 3 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-15m", - "threshold": 10, - "thresholdType": "LessThan", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "timeRange": "-30m", + "threshold": 1, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 3 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - High CPU Utilization", - "description": "This alert fires when we detect that the average CPU utilization for a database is high (>=85%) for an interval of 5 minutes.", + "name": "AWS SQS - Message processing not fast enough", + "description": "This alert fires when we detect message processing is not fast enough. 
That is, the average approximate age of the oldest non-deleted message in the queue is more than 5 seconds for an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1903,89 +1979,89 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=CPUUtilization statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "metric=ApproximateAgeOfOldestMessage Statistic=avg region=* account=* queuename=* namespace=aws/sqs | avg by account,region,namespace,queuename " } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, - "thresholdType": "GreaterThanOrEqual", - "field": null, + "threshold": 5, + "thresholdType": "GreaterThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "minDataPoints": 3 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, - "thresholdType": "LessThan", - "field": null, + "threshold": 5, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "minDataPoints": 3 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Classic Load Balancer - High Latency", - "description": "This alert fires when we detect that the average latency for a given Classic load balancer within a time interval of 5 minutes is greater than or equal to three seconds.", + "name": "AWS Classic Load Balancer - Access from Highly Malicious Sources", + "description": "This alert fires when the Classic load balancer is accessed from highly malicious IP addresses within last 5 minutes.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/elb metric=Latency Statistic=Average account=* region=* loadbalancername=* | eval(_value*1000) | sum by account, region, namespace, loadbalancername" + "query": "account=* region=* namespace=aws/elb\n| parse \"* * * * * * * * * * * \\\"*\\\" \\\"*\\\" * *\" as datetime, loadbalancername, client, backend, request_processing_time, backend_processing_time, response_processing_time, elb_status_code, backend_status_code, received_bytes, sent_bytes, request, user_agent, ssl_cipher, ssl_protocol\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancername, account, region, namespace\n| threatlookup singleIndicator ClientIp\n| where _threatlookup.threat_type=\"ip_address\" and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as 
MaliciousConfidence\n| where MaliciousConfidence=\"high\"\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancername, account, region, namespace, MaliciousConfidence, Actor" } ], "triggers": [ { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 3000, - "thresholdType": "GreaterThanOrEqual", - "occurrenceType": "Always", - "minDataPoints": 2 + "threshold": 0, + "thresholdType": "GreaterThan", + "field": null }, { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 3000, - "thresholdType": "LessThan", - "occurrenceType": "Always", - "minDataPoints": 2 + "threshold": 0, + "thresholdType": "LessThanOrEqual", + "field": null } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - Low Burst Balance", - "description": "This alert fires when we observe a low burst balance (<= 50%) for a given database. A low burst balance indicates you won't be able to scale up as fast for burstable database workloads on gp2 volumes.", + "name": "AWS Network Load Balancer - High TLS Negotiation Errors", + "description": "This alert fires when we detect that there are too many TLS Negotiation Errors (>=10%) within an interval of 5 minutes for a given network load balancer", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1995,7 +2071,15 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=BurstBalance statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "Namespace=aws/NetworkELB metric=ClientTLSNegotiationErrorCount Statistic=sum account=* region=* LoadBalancer=* | sum by LoadBalancer, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/NetworkELB metric=TargetTLSNegotiationErrorCount Statistic=sum account=* region=* LoadBalancer=* | sum by LoadBalancer, account, region, namespace" + }, + { + "rowId": "C", + "query": "(#A + #B) along LoadBalancer, account, region, namespace" } ], "triggers": [ @@ -2004,8 +2088,8 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "LessThanOrEqual", + "threshold": 10, + "thresholdType": "GreaterThanOrEqual", "field": null, "occurrenceType": "Always", "triggerSource": "AnyTimeSeries", @@ -2016,20 +2100,23 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "GreaterThan", + "threshold": 10, + "thresholdType": "LessThan", "field": null, "occurrenceType": "Always", "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { "name": "Amazon Elasticache - High Engine CPU Utilization", @@ -2072,16 +2159,19 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + 
"monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Network Load Balancer - High Unhealthy Hosts", - "description": "This alert fires when we detect that are there are too many unhealthy hosts (>=10%) within an interval of 5 minutes for a given network load balancer", + "name": "Amazon RDS - Low Burst Balance", + "description": "This alert fires when we observe a low burst balance (<= 50%) for a given database. A low burst balance indicates you won't be able to scale up as fast for burstable database workloads on gp2 volumes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -2091,15 +2181,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/NetworkELB metric=UnHealthyHostCount Statistic=sum account=* region=* LoadBalancer=* AvailabilityZone=* | sum by LoadBalancer, AvailabilityZone, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/NetworkELB metric=HealthyHostCount Statistic=sum account=* region=* LoadBalancer=* AvailabilityZone=* | sum by LoadBalancer, AvailabilityZone, account, region, namespace" - }, - { - "rowId": "C", - "query": "#A * 100 / (#A + #B) along LoadBalancer, AvailabilityZone, account, region, namespace" + "query": "Namespace=aws/rds metric=BurstBalance statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ @@ -2108,8 +2190,8 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, - "thresholdType": "GreaterThanOrEqual", + "threshold": 50, + "thresholdType": "LessThanOrEqual", "field": null, "occurrenceType": "Always", "triggerSource": "AnyTimeSeries", @@ -2120,34 +2202,84 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, - "thresholdType": "LessThan", + "threshold": 50, + "thresholdType": "GreaterThan", "field": null, "occurrenceType": "Always", "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Application Load Balancer - Access from Highly Malicious Sources", - "description": "This alert fires when an Application load balancer is accessed from highly malicious IP addresses within last 5 minutes", + "name": "AWS SQS - Messages not processed", + "description": "This alert fires when we detect messages that have been received by a consumer, but have not been processed (deleted/failed). 
That is, the average number of messages that are in flight are >=20 for an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", + "alertName": null, + "runAs": null, + "notificationGroupFields": [], + "queries": [ + { + "rowId": "A", + "query": "metric=ApproximateNumberOfMessagesNotVisible Statistic=avg region = * account=* queuename=* namespace=aws/sqs | avg by account, region, namespace, queuename " + } + ], + "triggers": [ + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "Critical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 20, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 3 + }, + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "ResolvedCritical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 20, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 3 + } + ], + "timeZone": null, + "notifications": [], + "isDisabled": true, + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] + }, + { + "name": "Amazon ECS - High CPU Utilization", + "description": "This alert fires when the average CPU utilization within a 5 minute interval for a service within a cluster is high (>=85%).", + "type": "MonitorsLibraryMonitorExport", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/applicationelb\n| parse \"* * * * * * * * * * * * \\\"*\\\" \\\"*\\\" * * * \\\"*\\\"\" as Type, DateTime, loadbalancer, Client, Target, RequestProcessingTime, TargetProcessingTime, ResponseProcessingTime, ElbStatusCode, TargetStatusCode, ReceivedBytes, SentBytes, Request, UserAgent, SslCipher, SslProtocol, TargetGroupArn, TraceId\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancer, account, region, namespace\n| lookup type, actor, raw, threatlevel as MaliciousConfidence from sumo://threat/cs on threat=ClientIp \n| json field=raw \"labels[*].name\" as LabelName \n| replace(LabelName, \"\\\\/\",\"->\") as LabelName\n| replace(LabelName, \"\\\"\",\" \") as LabelName\n| where type=\"ip_address\" and MaliciousConfidence=\"high\"\n| if (isEmpty(actor), \"Unassigned\", actor) as Actor\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancer, account, region, namespace, MaliciousConfidence, Actor, LabelName" + "query": "Namespace=aws/ecs metric=CPUUtilization statistic=Average account=* region=* ClusterName=* ServiceName=* | avg by ClusterName, ServiceName, account, region, namespace" } ], "triggers": [ @@ -2156,36 +2288,39 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "GreaterThan", + "threshold": 85, + "thresholdType": "GreaterThanOrEqual", "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "occurrenceType": "Always", + "triggerSource": "AnyTimeSeries", + "minDataPoints": 2 }, { "detectionMethod": "StaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "LessThanOrEqual", + "threshold": 85, + "thresholdType": 
"LessThan", "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "occurrenceType": "Always", + "triggerSource": "AnyTimeSeries", + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": true, + "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Application Load Balancer - High Latency", - "description": "This alert fires when we detect that the average latency for a given Application load balancer within a time interval of 5 minutes is greater than or equal to three seconds.", + "name": "Amazon RDS - High Disk Queue Depth", + "description": "This alert fires when the average disk queue depth for a database is high (>=5) for an interval of 5 minutes. Higher this value, higher will be the number of outstanding I/Os (read/write requests) waiting to access the disk, which will impact the performance of your application.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -2195,7 +2330,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/applicationelb metric=TargetResponseTime Statistic=Average account=* region=* loadbalancer=* | eval(_value*1000) | sum by account, region, namespace, loadbalancer" + "query": "Namespace=aws/rds metric=DiskQueueDepth statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ @@ -2204,7 +2339,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 3000, + "threshold": 5, "thresholdType": "GreaterThanOrEqual", "field": null, "occurrenceType": "Always", @@ -2216,7 +2351,7 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 3000, + "threshold": 5, "thresholdType": "LessThan", "field": null, "occurrenceType": "Always", @@ -2224,12 +2359,15 @@ "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": false, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] } ] } \ No newline at end of file From 17a48e3a09f49947ea02cba3363e23ada2790707 Mon Sep 17 00:00:00 2001 From: Himanshu Sharma Date: Wed, 10 Apr 2024 17:15:36 +0530 Subject: [PATCH 2/3] IP v4:v6 changes in TI for Tf monitors --- aws-observability-terraform/app-modules/alb/app.tf | 2 +- aws-observability-terraform/app-modules/elb/app.tf | 2 +- aws-observability-terraform/app-modules/sns/app.tf | 2 +- aws-observability-terraform/app-modules/sqs/app.tf | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/aws-observability-terraform/app-modules/alb/app.tf b/aws-observability-terraform/app-modules/alb/app.tf index ccdc3a43..cfad3a2e 100644 --- a/aws-observability-terraform/app-modules/alb/app.tf +++ b/aws-observability-terraform/app-modules/alb/app.tf @@ -27,7 +27,7 @@ module "alb_module" { monitor_is_disabled = var.monitors_disabled monitor_evaluation_delay = "0m" queries = { - A = "account=* region=* namespace=aws/applicationelb\n| parse \"* * * * * * * * * * * * \\\"*\\\" \\\"*\\\" * * * \\\"*\\\"\" as Type, DateTime, loadbalancer, Client, Target, RequestProcessingTime, TargetProcessingTime, ResponseProcessingTime, ElbStatusCode, TargetStatusCode, ReceivedBytes, SentBytes, Request, UserAgent, SslCipher, SslProtocol, TargetGroupArn, TraceId\n| 
parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancer, account, region, namespace\n| threatlookup singleIndicator ClientIp\n| where _threatlookup.threat_type=\"ip_address\" and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as MaliciousConfidence\n| where MaliciousConfidence=\"high\"\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancer, account, region, namespace, MaliciousConfidence, Actor" + A = "account=* region=* namespace=aws/applicationelb\n| parse \"* * * * * * * * * * * * \\\"*\\\" \\\"*\\\" * * * \\\"*\\\"\" as Type, DateTime, loadbalancer, Client, Target, RequestProcessingTime, TargetProcessingTime, ResponseProcessingTime, ElbStatusCode, TargetStatusCode, ReceivedBytes, SentBytes, Request, UserAgent, SslCipher, SslProtocol, TargetGroupArn, TraceId\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancer, account, region, namespace\n| threatlookup singleIndicator ClientIp\n| where (_threatlookup.type=\"ipv4-addr:value\" or _threatlookup.type=\"ipv6-addr:value\") and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as MaliciousConfidence\n| where MaliciousConfidence=\"high\"\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancer, account, region, namespace, MaliciousConfidence, Actor" } triggers = [ { diff --git a/aws-observability-terraform/app-modules/elb/app.tf b/aws-observability-terraform/app-modules/elb/app.tf index 45cd9475..6519be97 100644 --- a/aws-observability-terraform/app-modules/elb/app.tf +++ b/aws-observability-terraform/app-modules/elb/app.tf @@ -25,7 +25,7 @@ module "classic_elb_module" { monitor_is_disabled = var.monitors_disabled monitor_evaluation_delay = "0m" queries = { - A = "account=* region=* namespace=aws/elb\n| parse \"* * * * * * * * * * * \\\"*\\\" \\\"*\\\" * *\" as datetime, loadbalancername, client, backend, request_processing_time, backend_processing_time, response_processing_time, elb_status_code, backend_status_code, received_bytes, sent_bytes, request, user_agent, ssl_cipher, ssl_protocol\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancername, account, region, namespace\n| threatlookup singleIndicator ClientIp\n| where _threatlookup.threat_type=\"ip_address\" and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as MaliciousConfidence\n| where MaliciousConfidence=\"high\"\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancername, 
account, region, namespace, MaliciousConfidence, Actor" + A = "account=* region=* namespace=aws/elb\n| parse \"* * * * * * * * * * * \\\"*\\\" \\\"*\\\" * *\" as datetime, loadbalancername, client, backend, request_processing_time, backend_processing_time, response_processing_time, elb_status_code, backend_status_code, received_bytes, sent_bytes, request, user_agent, ssl_cipher, ssl_protocol\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancername, account, region, namespace\n| threatlookup singleIndicator ClientIp\n| where (_threatlookup.type=\"ipv4-addr:value\" or _threatlookup.type=\"ipv6-addr:value\") and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as MaliciousConfidence\n| where MaliciousConfidence=\"high\"\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancername, account, region, namespace, MaliciousConfidence, Actor" } triggers = [ { diff --git a/aws-observability-terraform/app-modules/sns/app.tf b/aws-observability-terraform/app-modules/sns/app.tf index c819b52b..c9fb38c4 100644 --- a/aws-observability-terraform/app-modules/sns/app.tf +++ b/aws-observability-terraform/app-modules/sns/app.tf @@ -97,7 +97,7 @@ module "sns_module" { monitor_is_disabled = var.monitors_disabled monitor_evaluation_delay = "0m" queries = { - A = "account=* region=* namespace=aws/sns \"\\\"eventsource\\\":\\\"sns.amazonaws.com\\\"\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"sourceIPAddress\", \"userAgent\", \"eventType\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"requestID\", \"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, src_ip, user_agent, event_type, recipient_account_id, requestParameters, responseElements, request_id, error_code, error_message nodrop\n| where event_source = \"sns.amazonaws.com\" \n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, user_type, arn, username nodrop\n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountid, user nodrop\n| json field=requestParameters \"topicArn\", \"name\", \"resourceArn\", \"subscriptionArn\" as req_topic_arn, req_topic_name, resource_arn, subscription_arn nodrop \n| json field=responseElements \"topicArn\" as res_topic_arn nodrop\n| if (isBlank(req_topic_arn), res_topic_arn, req_topic_arn) as topic_arn\n| if (isBlank(topic_arn), resource_arn, topic_arn) as topic_arn\n| parse field=topic_arn \"arn:aws:sns:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp nodrop\n| parse field=subscription_arn \"arn:aws:sns:*:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp, arn_value_temp nodrop\n| if (isBlank(req_topic_name), topic_arn_name_temp, req_topic_name) as topicname\n| if (isBlank(accountid), recipient_account_id, accountid) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status\n| if (isEmpty(username), user_type, username) as user_type\n| count as ip_count by src_ip, event_name, region, accountid,user_type\n| threatlookup singleIndicator src_ip\n| where _threatlookup.threat_type=\"ip_address\" and 
!isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as malicious_confidence\n| where malicious_confidence=\"high\"\n| sum(ip_count) as threat_count by src_ip, event_name, region, accountid, malicious_confidence, actor" + A = "account=* region=* namespace=aws/sns \"\\\"eventsource\\\":\\\"sns.amazonaws.com\\\"\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"sourceIPAddress\", \"userAgent\", \"eventType\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"requestID\", \"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, src_ip, user_agent, event_type, recipient_account_id, requestParameters, responseElements, request_id, error_code, error_message nodrop\n| where event_source = \"sns.amazonaws.com\" \n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, user_type, arn, username nodrop\n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountid, user nodrop\n| json field=requestParameters \"topicArn\", \"name\", \"resourceArn\", \"subscriptionArn\" as req_topic_arn, req_topic_name, resource_arn, subscription_arn nodrop \n| json field=responseElements \"topicArn\" as res_topic_arn nodrop\n| if (isBlank(req_topic_arn), res_topic_arn, req_topic_arn) as topic_arn\n| if (isBlank(topic_arn), resource_arn, topic_arn) as topic_arn\n| parse field=topic_arn \"arn:aws:sns:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp nodrop\n| parse field=subscription_arn \"arn:aws:sns:*:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp, arn_value_temp nodrop\n| if (isBlank(req_topic_name), topic_arn_name_temp, req_topic_name) as topicname\n| if (isBlank(accountid), recipient_account_id, accountid) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status\n| if (isEmpty(username), user_type, username) as user_type\n| count as ip_count by src_ip, event_name, region, accountid,user_type\n| threatlookup singleIndicator src_ip\n| where (_threatlookup.type=\"ipv4-addr:value\" or _threatlookup.type=\"ipv6-addr:value\") and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as malicious_confidence\n| where malicious_confidence=\"high\"\n| sum(ip_count) as threat_count by src_ip, event_name, region, accountid, malicious_confidence, actor" } triggers = [ diff --git a/aws-observability-terraform/app-modules/sqs/app.tf b/aws-observability-terraform/app-modules/sqs/app.tf index ae9b7caf..dad43377 100644 --- a/aws-observability-terraform/app-modules/sqs/app.tf +++ b/aws-observability-terraform/app-modules/sqs/app.tf @@ -97,7 +97,7 @@ module "sqs_module" { monitor_is_disabled = var.monitors_disabled monitor_evaluation_delay = "0m" queries = { - A = "account=* region=* namespace=\"aws/sqs\" eventname eventsource \"sqs.amazonaws.com\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"recipientAccountId\", \"requestParameters\", 
\"responseElements\", \"sourceIPAddress\",\"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, recipient_account_id, requestParameters, responseElements, src_ip, error_code, error_message nodrop\n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, type, arn, username nodrop\n| json field=requestParameters \"queueUrl\" as queueUrlReq nodrop \n| json field=responseElements \"queueUrl\" as queueUrlRes nodrop\n| where event_source=\"sqs.amazonaws.com\" and !(src_ip matches \"*.amazonaws.com\")\n| if(event_name=\"CreateQueue\", queueUrlRes, queueUrlReq) as queueUrl \n| parse regex field=queueUrl \"(?[^\\/]*$)\"\n| if (isBlank(recipient_account_id), accountid, recipient_account_id) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status \n| count as ip_count by src_ip\n| threatlookup singleIndicator src_ip\n| where _threatlookup.threat_type=\"ip_address\" and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as malicious_confidence\n| where malicious_confidence=\"high\"\n| sort by ip_count, src_ip\n| fields src_ip, malicious_confidence, actor" + A = "account=* region=* namespace=\"aws/sqs\" eventname eventsource \"sqs.amazonaws.com\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"sourceIPAddress\",\"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, recipient_account_id, requestParameters, responseElements, src_ip, error_code, error_message nodrop\n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, type, arn, username nodrop\n| json field=requestParameters \"queueUrl\" as queueUrlReq nodrop \n| json field=responseElements \"queueUrl\" as queueUrlRes nodrop\n| where event_source=\"sqs.amazonaws.com\" and !(src_ip matches \"*.amazonaws.com\")\n| if(event_name=\"CreateQueue\", queueUrlRes, queueUrlReq) as queueUrl \n| parse regex field=queueUrl \"(?[^\\/]*$)\"\n| if (isBlank(recipient_account_id), accountid, recipient_account_id) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status \n| count as ip_count by src_ip\n| threatlookup singleIndicator src_ip\n| where (_threatlookup.type=\"ipv4-addr:value\" or _threatlookup.type=\"ipv6-addr:value\") and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as malicious_confidence\n| where malicious_confidence=\"high\"\n| sort by ip_count, src_ip\n| fields src_ip, malicious_confidence, actor" } triggers = [ { From 49f572f87a5cea463a34f921af0f1d4be34ccef8 Mon Sep 17 00:00:00 2001 From: Himanshu Sharma Date: Wed, 10 Apr 2024 17:16:02 +0530 Subject: [PATCH 3/3] IP v4:v6 changes in TI for cf monitors --- .../aws-observability/json/Alerts-App.json | 1736 +++++++++++------ aws-observability/json/Alerts-App.json | 1140 +++++------ 2 files changed, 1628 insertions(+), 1248 deletions(-) diff --git 
a/aws-observability-terraform/examples/aws-observability/json/Alerts-App.json b/aws-observability-terraform/examples/aws-observability/json/Alerts-App.json index 2ecae908..fb96a1e4 100644 --- a/aws-observability-terraform/examples/aws-observability/json/Alerts-App.json +++ b/aws-observability-terraform/examples/aws-observability/json/Alerts-App.json @@ -1,13 +1,13 @@ { "name": "AWS Observability", - "description": "", + "description": "This folder contains all the monitors for AWS Observability solution.", "type": "MonitorsLibraryFolderExport", "children": [ { - "name": "AWS API Gateway - High Latency", - "description": "This alert fires when we detect that the average latency for a given API Gateway is greater than or equal to one second for 5 minutes.", + "name": "AWS SQS - Access from Highly Malicious Sources", + "description": "This alert fires when an Application AWS - SQS is accessed from highly malicious IP addresses within last 5 minutes", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", + "monitorType": "Logs", "evaluationDelay": "0m", "alertName": null, "runAs": null, @@ -15,39 +15,42 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/apigateway metric=Latency statistic=Average account=* region=* apiname=* | avg by apiname, namespace, region, account" + "query": "account=* region=* namespace=\"aws/sqs\" eventname eventsource \"sqs.amazonaws.com\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"sourceIPAddress\",\"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, recipient_account_id, requestParameters, responseElements, src_ip, error_code, error_message nodrop\n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, type, arn, username nodrop\n| json field=requestParameters \"queueUrl\" as queueUrlReq nodrop \n| json field=responseElements \"queueUrl\" as queueUrlRes nodrop\n| where event_source=\"sqs.amazonaws.com\" and !(src_ip matches \"*.amazonaws.com\")\n| if(event_name=\"CreateQueue\", queueUrlRes, queueUrlReq) as queueUrl \n| parse regex field=queueUrl \"(?[^\\/]*$)\"\n| if (isBlank(recipient_account_id), accountid, recipient_account_id) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status \n| count as ip_count by src_ip\n| threatlookup singleIndicator src_ip\n| where (_threatlookup.type=\"ipv4-addr:value\" or _threatlookup.type=\"ipv6-addr:value\") and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as malicious_confidence\n| where malicious_confidence=\"high\"\n| sort by ip_count, src_ip\n| fields src_ip, malicious_confidence, actor" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1000, - "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "threshold": 0, + "thresholdType": "GreaterThan", + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, 
"timeRange": "-5m", - "threshold": 1000, - "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "threshold": 0, + "thresholdType": "LessThanOrEqual", + "field": null } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - High Engine CPU Utilization", - "description": "This alert fires when the average CPU utilization for the Redis engine process within a 5 minute interval is high (>=90%). For larger node types with four vCPUs or more, use the EngineCPUUtilization metric to monitor and set thresholds for scaling.", + "name": "AWS EC2 - High Total CPU Utilization", + "description": "This alert fires when the average total CPU utilization within a 5 minute interval for an EC2 instance is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "0m", @@ -57,627 +60,720 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=EngineCPUUtilization statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by CacheClusterId, CacheNodeId, account, region, namespace" + "query": "Namespace=aws/ec2 metric=CPU_Total account=* region=* instanceid=* | avg by account, region, namespace, instanceid" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 90, + "threshold": 85, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 90, + "threshold": 85, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS EC2 - High Memory Utilization", - "description": "This alert fires when the average memory utilization within a 5 minute interval for an EC2 instance is high (>=85%).", + "name": "Amazon RDS - High Write Latency", + "description": "This alert fires when the average write latency of a database within a 5 minute interval is high (>=5 seconds) . 
High write latencies will affect the performance of your application.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/ec2 metric=Mem_UsedPercent account=* region=* instanceid=* | avg by account, region, namespace, instanceid" + "query": "Namespace=aws/rds metric=WriteLatency statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - Low Redis Cache Hit Rate", - "description": "This alert fires when the average cache hit rate for Redis within a 5 minute interval is low (<= 80%). This indicates low efficiency of the Redis instance. If cache ratio is lower than 80%, that indicates a significant amount of keys are either evicted, expired, or don't exist.", + "name": "Amazon RDS - Low Aurora Buffer Cache Hit Ratio", + "description": "This alert fires when the average RDS Aurora buffer cache hit ratio within a 5 minute interval is low (<= 50%). 
This indicates that a lower percentage of requests were are served by the buffer cache, which could further indicate a degradation in application performance.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=CacheHitRate statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" + "query": "Namespace=aws/rds metric=BufferCacheHitRatio statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 50, "thresholdType": "LessThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 50, "thresholdType": "GreaterThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - High 5XX Errors", - "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", + "name": "Amazon Elasticache - High Redis Memory Fragmentation Ratio", + "description": "This alert fires when the average Redis memory fragmentation ratio for within a 5 minute interval is high (>=1.5). 
Value equal to or greater than 1.5 Indicate significant memory fragmentation.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/apigateway metric=5xxError Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/apigateway metric=count Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" - }, - { - "rowId": "C", - "query": "#A * 100 / #B along account, region, namespace" + "query": "Namespace=aws/elasticache metric=MemoryFragmentationRatio statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 1.5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 1.5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Application Load Balancer - High Latency", - "description": "This alert fires when we detect that the average latency for a given Application load balancer within a time interval of 5 minutes is greater than or equal to three seconds.", + "name": "Amazon ECS - High CPU Utilization", + "description": "This alert fires when the average CPU utilization within a 5 minute interval for a service within a cluster is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/applicationelb metric=TargetResponseTime Statistic=Average account=* region=* loadbalancer=* | eval(_value*1000) | sum by account, region, namespace, loadbalancer" + "query": "Namespace=aws/ecs metric=CPUUtilization statistic=Average account=* region=* ClusterName=* ServiceName=* | avg by ClusterName, ServiceName, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 3000, + "threshold": 85, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 3000, + "threshold": 85, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": 
"AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon ECS - High Memory Utilization", - "description": "This alert fires when the average memory utilization within a 5 minute interval for a service within a cluster is high (>=85%).", + "name": "AWS SNS - Failed Notifications", + "description": "This alert fires where there are many failed notifications (>=5) within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/ecs metric=MemoryUtilization statistic=Average account=* region=* ClusterName=* ServiceName=* | avg by ClusterName, ServiceName, account, region, namespace" + "query": "account=* region=* namespace=aws/sns TopicName=* metric=NumberOfNotificationsFailed Statistic=Sum \n| sum by account, region, TopicName" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, - "thresholdType": "GreaterThanOrEqual", - "field": null, + "threshold": 2, + "thresholdType": "GreaterThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, - "thresholdType": "LessThan", - "field": null, + "threshold": 2, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - High Write Latency", - "description": "This alert fires when the average write latency of a database within a 5 minute interval is high (>=5 seconds) . High write latencies will affect the performance of your application.", + "name": "Amazon RDS - High Read Latency", + "description": "This alert fires when the average read latency of a database within a 5 minutes time inerval is high (>=5 seconds). 
High read latency will affect the performance of your application.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=WriteLatency statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "Namespace=aws/rds metric=ReadLatency statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Application Load Balancer - High 4XX Errors", + "name": "AWS Classic Load Balancer - High 4XX Errors", "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/applicationelb metric=HTTPCode_ELB_4XX_Count Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" + "query": "Namespace=aws/elb metric=HTTPCode_ELB_4XX Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" }, { "rowId": "B", - "query": "Namespace=aws/applicationelb metric=RequestCount Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" + "query": "Namespace=aws/elb metric=RequestCount Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" }, { "rowId": "C", - "query": "#A * 100 / #B along loadbalancer, account, region, namespace" + "query": "#A * 100 / #B along loadbalancername, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - 
"groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - Low Burst Balance", - "description": "This alert fires when we observe a low burst balance (<= 50%) for a given database. A low burst balance indicates you won't be able to scale up as fast for burstable database workloads on gp2 volumes.", + "name": "Amazon Elasticache - High Redis Database Memory Usage", + "description": "This alert fires when the average database memory usage within a 5 minute interval for the Redis engine is high (>=95%). When the value reaches 100%, eviction may happen or write operations may fail based on ElastiCache policies thereby impacting application performance.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=BurstBalance statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "Namespace=aws/elasticache metric=DatabaseMemoryUsagePercentage statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "LessThanOrEqual", - "field": null, + "threshold": 95, + "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "GreaterThan", - "field": null, + "threshold": 95, + "thresholdType": "LessThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Classic Load Balancer - Access from Highly Malicious Sources", - "description": "This alert fires when the Classic load balancer is accessed from highly malicious IP addresses within last 5 minutes.", + "name": "AWS SQS - Messages not processed", + "description": "This alert fires when we detect messages that have been received by a consumer, but have not been processed (deleted/failed). 
That is, the average number of messages that are in flight are >=20 for an interval of 5 minutes", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/elb\n| parse \"* * * * * * * * * * * \\\"*\\\" \\\"*\\\" * *\" as datetime, loadbalancername, client, backend, request_processing_time, backend_processing_time, response_processing_time, elb_status_code, backend_status_code, received_bytes, sent_bytes, request, user_agent, ssl_cipher, ssl_protocol\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancername, account, region, namespace\n| lookup type, actor, raw, threatlevel as MaliciousConfidence from sumo://threat/cs on threat=ClientIp \n| json field=raw \"labels[*].name\" as LabelName \n| replace(LabelName, \"\\\\/\",\"->\") as LabelName\n| replace(LabelName, \"\\\"\",\" \") as LabelName\n| where type=\"ip_address\" and MaliciousConfidence=\"high\"\n| if (isEmpty(actor), \"Unassigned\", actor) as Actor\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancername, account, region, namespace, MaliciousConfidence, Actor, LabelName" + "query": "metric=ApproximateNumberOfMessagesNotVisible Statistic=avg region = * account=* queuename=* namespace=aws/sqs | avg by account, region, namespace, queuename " } ], "triggers": [ { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "GreaterThan", - "field": "" + "threshold": 20, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 }, { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "LessThanOrEqual", - "field": "" + "threshold": 20, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, - "playbook": "" + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Classic Load Balancer - High 5XX Errors", - "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", + "name": "AWS DynamoDB - High Max Provisioned Table Read Capacity", + "description": "This alert fires when we detect that the average percentage of read provisioned capacity used by the highest read provisioned table of an account for a time interval of 5 minutes is great than or equal to 80%. 
High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/elb metric=HTTPCode_ELB_5XX Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/elb metric=RequestCount Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" - }, - { - "rowId": "C", - "query": "#A * 100 / #B along loadbalancername, account, region, namespace" + "query": "Namespace=aws/dynamodb metric=MaxProvisionedTableReadCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" } ], "triggers": [ { "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 80, "thresholdType": "GreaterThanOrEqual", - "occurrenceType": "Always" + "occurrenceType": "Always", + "minDataPoints": 2 }, { "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 80, "thresholdType": "LessThan", - "occurrenceType": "Always" + "occurrenceType": "Always", + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Network Load Balancer - High Unhealthy Hosts", - "description": "This alert fires when we detect that are there are too many unhealthy hosts (>=10%) within an interval of 5 minutes for a given network load balancer", + "name": "AWS DynamoDB - High Max Provisioned Table Write Capacity", + "description": "This alert fires when we detect that the average percentage of write provisioned capacity used by the highest write provisioned table of an account for a time interval of 5 minutes is great than or equal to 80%. 
High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/NetworkELB metric=UnHealthyHostCount Statistic=sum account=* region=* LoadBalancer=* AvailabilityZone=* | sum by LoadBalancer, AvailabilityZone, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/NetworkELB metric=HealthyHostCount Statistic=sum account=* region=* LoadBalancer=* AvailabilityZone=* | sum by LoadBalancer, AvailabilityZone, account, region, namespace" - }, - { - "rowId": "C", - "query": "#A * 100 / (#A + #B) along LoadBalancer, AvailabilityZone, account, region, namespace" + "query": "Namespace=aws/dynamodb metric=MaxProvisionedTableWriteCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, + "threshold": 80, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, + "threshold": 80, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - High Integration Latency", - "description": "This alert fires when we detect that the average integration latency for a given API Gateway is greater than or equal to one second for 5 minutes.", + "name": "Amazon RDS - Low Burst Balance", + "description": "This alert fires when we observe a low burst balance (<= 50%) for a given database. 
A low burst balance indicates you won't be able to scale up as fast for burstable database workloads on gp2 volumes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/apigateway metric=IntegrationLatency statistic=Average account=* region=* apiname=* | avg by apiname, namespace, region, account" + "query": "Namespace=aws/rds metric=BurstBalance statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1000, - "thresholdType": "GreaterThanOrEqual", - "field": null, + "threshold": 50, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1000, - "thresholdType": "LessThan", - "field": null, + "threshold": 50, + "thresholdType": "GreaterThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Lambda - High Percentage of Failed Requests", - "description": "This alert fires when we detect a large number of failed Lambda requests (>5%) within an interval of 5 minutes.", + "name": "AWS SQS - Queue has stopped receiving messages", + "description": "This alert fires when we detect that the queue has stopped receiving messages. 
That is, the average number of messages received in the queue <1 for an interval of 30 minutes", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/lambda metric=Errors Statistic=Sum account=* region=* functionname=* | sum by functionname, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/lambda metric=Invocations Statistic=Sum account=* region=* functionname=* | sum by functionname, account, region, namespace" - }, - { - "rowId": "C", - "query": "#A * 100 / #B along functionname, account, region, namespace" + "query": "metric=NumberOfMessagesReceived Statistic=avg region=* account=* queuename=* namespace=aws/sqs | avg by account, region, namespace, queuename " } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", - "timeRange": "-5m", - "threshold": 5, - "thresholdType": "GreaterThanOrEqual", - "field": null, + "resolutionWindow": null, + "timeRange": "-30m", + "threshold": 1, + "thresholdType": "LessThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", - "timeRange": "-5m", - "threshold": 5, - "thresholdType": "LessThan", - "field": null, + "resolutionWindow": null, + "timeRange": "-30m", + "threshold": 1, + "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Application Load Balancer - High 5XX Errors", - "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", + "name": "Amazon RDS - High Disk Queue Depth", + "description": "This alert fires when the average disk queue depth for a database is high (>=5) for an interval of 5 minutes. 
Higher this value, higher will be the number of outstanding I/Os (read/write requests) waiting to access the disk, which will impact the performance of your application.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/applicationelb metric=HTTPCode_ELB_5XX_Count Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" + "query": "Namespace=aws/rds metric=DiskQueueDepth statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + } + ], + "triggers": [ + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "Critical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 5, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 + }, + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "ResolvedCritical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 5, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 + } + ], + "timeZone": null, + "notifications": [], + "isDisabled": true, + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] + }, + { + "name": "AWS Application Load Balancer - High 5XX Errors", + "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", + "type": "MonitorsLibraryMonitorExport", + "monitorType": "Metrics", + "evaluationDelay": "4m", + "alertName": null, + "runAs": null, + "notificationGroupFields": [], + "queries": [ + { + "rowId": "A", + "query": "Namespace=aws/applicationelb metric=HTTPCode_ELB_5XX_Count Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" }, { "rowId": "B", @@ -690,36 +786,41 @@ ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - High Redis Memory Fragmentation Ratio", - "description": "This alert fires when the average Redis memory fragmentation ratio for within a 5 minute interval is high (>=1.5). 
Value equal to or greater than 1.5 Indicate significant memory fragmentation.", + "name": "AWS SNS - Failed Events", + "description": "This alert fires when an SNS app has high number of failed events (>5) within last 5 minutes", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", + "monitorType": "Logs", "evaluationDelay": "0m", "alertName": null, "runAs": null, @@ -727,253 +828,591 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=MemoryFragmentationRatio statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" + "query": "account=* region=* namespace=aws/sns \"\\\"eventsource\\\":\\\"sns.amazonaws.com\\\"\" errorCode \n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"sourceIPAddress\", \"userAgent\", \"eventType\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"requestID\", \"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, src_ip, user_agent, event_type, recipient_account_id, requestParameters, responseElements, request_id, error_code, error_message nodrop \n| where event_source = \"sns.amazonaws.com\" and !isblank(error_code) \n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, type, arn, username nodrop \n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountId, user nodrop \n| json field=requestParameters \"topicArn\", \"name\", \"resourceArn\", \"subscriptionArn\" as req_topic_arn, req_topic_name, resource_arn, subscription_arn nodrop \n| json field=responseElements \"topicArn\" as res_topic_arn nodrop \n| if (isBlank(req_topic_arn), res_topic_arn, req_topic_arn) as topic_arn \n| if (isBlank(topic_arn), resource_arn, topic_arn) as topic_arn \n| parse field=topic_arn \"arn:aws:sns:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp nodrop \n| parse field=subscription_arn \"arn:aws:sns:*:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp, arn_value_temp nodrop \n| if (isBlank(req_topic_name), topic_arn_name_temp, req_topic_name) as topicname \n| if (isBlank(accountid), recipient_account_id, accountid) as accountid \n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status \n| if (isEmpty(username), user, username) as user \n| count as event_count by event_name, error_code, error_message, region, src_ip, accountid, user, type, request_id, topicname, topic_arn, user_agent" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1.5, + "threshold": 5, + "thresholdType": "GreaterThan", + "field": null + }, + { + "detectionMethod": "LogsStaticCondition", + "triggerType": "ResolvedCritical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 5, + "thresholdType": "LessThanOrEqual", + "field": null + } + ], + "timeZone": null, + "notifications": [], + "isDisabled": true, + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] + }, + { + "name": "AWS API Gateway - High 4XX Errors", + "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", + "type": "MonitorsLibraryMonitorExport", + "monitorType": "Metrics", + "evaluationDelay": "4m", + "alertName": null, + "runAs": 
null, + "notificationGroupFields": [], + "queries": [ + { + "rowId": "A", + "query": "Namespace=aws/apigateway metric=4xxError Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/apigateway metric=count Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" + }, + { + "rowId": "C", + "query": "#A * 100 / #B along apiname, account, region, namespace" + } + ], + "triggers": [ + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "Critical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1.5, + "threshold": 5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - High CPU Utilization", - "description": "This alert fires when the average CPU utilization within a 5 minute interval for a host is high (>=90%). The CPUUtilization metric includes total CPU utilization across application, operating system and management processes. We highly recommend monitoring CPU utilization for hosts with two vCPUs or less.", + "name": "AWS Network Load Balancer - High TLS Negotiation Errors", + "description": "This alert fires when we detect that there are too many TLS Negotiation Errors (>=10%) within an interval of 5 minutes for a given network load balancer", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=CPUUtilization statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by CacheClusterId, CacheNodeId, account, region, namespace" + "query": "Namespace=aws/NetworkELB metric=ClientTLSNegotiationErrorCount Statistic=sum account=* region=* LoadBalancer=* | sum by LoadBalancer, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/NetworkELB metric=TargetTLSNegotiationErrorCount Statistic=sum account=* region=* LoadBalancer=* | sum by LoadBalancer, account, region, namespace" + }, + { + "rowId": "C", + "query": "(#A + #B) along LoadBalancer, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 90, + "threshold": 10, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 90, + "threshold": 10, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": 
"AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - Low Aurora Buffer Cache Hit Ratio", - "description": "This alert fires when the average RDS Aurora buffer cache hit ratio within a 5 minute interval is low (<= 50%). This indicates that a lower percentage of requests were are served by the buffer cache, which could further indicate a degradation in application performance.", + "name": "AWS DynamoDB - High Account Provisioned Write Capacity", + "description": "This alert fires when we detect that the average write capacity provisioned for an account for a time interval of 5 minutes is greater than or equal to 80%. High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=BufferCacheHitRatio statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "Namespace=aws/dynamodb metric=AccountProvisionedWriteCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" + } + ], + "triggers": [ + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "Critical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 80, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 + }, + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "ResolvedCritical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 80, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 + } + ], + "timeZone": null, + "notifications": [], + "isDisabled": true, + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] + }, + { + "name": "AWS API Gateway - High Integration Latency", + "description": "This alert fires when we detect that the average integration latency for a given API Gateway is greater than or equal to one second for 5 minutes.", + "type": "MonitorsLibraryMonitorExport", + "monitorType": "Metrics", + "evaluationDelay": "4m", + "alertName": null, + "runAs": null, + "notificationGroupFields": [], + "queries": [ + { + "rowId": "A", + "query": "Namespace=aws/apigateway metric=IntegrationLatency statistic=Average account=* region=* apiname=* | avg by apiname, namespace, region, account" + } + ], + "triggers": [ + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "Critical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 1000, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 + }, + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "ResolvedCritical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 1000, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 + } + ], + "timeZone": null, + "notifications": [], + "isDisabled": true, + "groupNotifications": true, + "playbook": "", + "sloId": null, + 
"monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] + }, + { + "name": "AWS Classic Load Balancer - High Latency", + "description": "This alert fires when we detect that the average latency for a given Classic load balancer within a time interval of 5 minutes is greater than or equal to three seconds.", + "type": "MonitorsLibraryMonitorExport", + "monitorType": "Metrics", + "evaluationDelay": "4m", + "alertName": null, + "runAs": null, + "notificationGroupFields": [], + "queries": [ + { + "rowId": "A", + "query": "Namespace=aws/elb metric=Latency Statistic=Average account=* region=* loadbalancername=* | eval(_value*1000) | sum by account, region, namespace, loadbalancername" + } + ], + "triggers": [ + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "Critical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 3000, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 + }, + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "ResolvedCritical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 3000, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 + } + ], + "timeZone": null, + "notifications": [], + "isDisabled": true, + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] + }, + { + "name": "AWS Application Load Balancer - High Latency", + "description": "This alert fires when we detect that the average latency for a given Application load balancer within a time interval of 5 minutes is greater than or equal to three seconds.", + "type": "MonitorsLibraryMonitorExport", + "monitorType": "Metrics", + "evaluationDelay": "4m", + "alertName": null, + "runAs": null, + "notificationGroupFields": [], + "queries": [ + { + "rowId": "A", + "query": "Namespace=aws/applicationelb metric=TargetResponseTime Statistic=Average account=* region=* loadbalancer=* | eval(_value*1000) | sum by account, region, namespace, loadbalancer" + } + ], + "triggers": [ + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "Critical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 3000, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 + }, + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "ResolvedCritical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 3000, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 + } + ], + "timeZone": null, + "notifications": [], + "isDisabled": true, + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] + }, + { + "name": "AWS DynamoDB - High Account Provisioned Read Capacity", + "description": "This alert fires when we detect that the average read capacity provisioned for an account for a time interval of 5 minutes is greater than or equal to 80%. 
High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", + "type": "MonitorsLibraryMonitorExport", + "monitorType": "Metrics", + "evaluationDelay": "4m", + "alertName": null, + "runAs": null, + "notificationGroupFields": [], + "queries": [ + { + "rowId": "A", + "query": "Namespace=aws/dynamodb metric=AccountProvisionedReadCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" + } + ], + "triggers": [ + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "Critical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 80, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 + }, + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "ResolvedCritical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 80, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 + } + ], + "timeZone": null, + "notifications": [], + "isDisabled": true, + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] + }, + { + "name": "AWS Lambda - Low Provisioned Concurrency Utilization", + "description": "This alert fires when the average provisioned concurrency utilization for 5 minutes is low (<= 50%). This indicates low provisioned concurrency utilization efficiency.", + "type": "MonitorsLibraryMonitorExport", + "monitorType": "Metrics", + "evaluationDelay": "4m", + "alertName": null, + "runAs": null, + "notificationGroupFields": [], + "queries": [ + { + "rowId": "A", + "query": "Namespace=aws/lambda metric=ProvisionedConcurrencyUtilization statistic=Average account=* region=* functionname=* | avg by functionname, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", "threshold": 50, "thresholdType": "LessThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", "threshold": 50, "thresholdType": "GreaterThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - High 4XX Errors", - "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", + "name": "AWS Network Load Balancer - High Unhealthy Hosts", + "description": "This alert fires when we detect that there are too many unhealthy hosts (>=10%) within an interval of 5 minutes for a given network load balancer", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/apigateway metric=4xxError Statistic=Sum account=* region=* apiname=* | sum by apiname, 
account, region, namespace" + "query": "Namespace=aws/NetworkELB metric=UnHealthyHostCount Statistic=sum account=* region=* LoadBalancer=* AvailabilityZone=* | sum by LoadBalancer, AvailabilityZone, account, region, namespace" }, { "rowId": "B", - "query": "Namespace=aws/apigateway metric=count Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" + "query": "Namespace=aws/NetworkELB metric=HealthyHostCount Statistic=sum account=* region=* LoadBalancer=* AvailabilityZone=* | sum by LoadBalancer, AvailabilityZone, account, region, namespace" }, { "rowId": "C", - "query": "#A * 100 / #B along apiname, account, region, namespace" + "query": "#A * 100 / (#A + #B) along LoadBalancer, AvailabilityZone, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 10, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 10, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS EC2 - High Total CPU Utilization", - "description": "This alert fires when the average total CPU utilization within a 5 minute interval for an EC2 instance is high (>=85%).", + "name": "AWS Classic Load Balancer - High 5XX Errors", + "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/ec2 metric=CPU_Total account=* region=* instanceid=* | avg by account, region, namespace, instanceid" + "query": "Namespace=aws/elb metric=HTTPCode_ELB_5XX Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/elb metric=RequestCount Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" + }, + { + "rowId": "C", + "query": "#A * 100 / #B along loadbalancername, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", 
- "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Classic Load Balancer - High Latency", - "description": "This alert fires when we detect that the average latency for a given Classic load balancer within a time interval of 5 minutes is greater than or equal to three seconds.", + "name": "Amazon Elasticache - High Engine CPU Utilization", + "description": "This alert fires when the average CPU utilization for the Redis engine process within a 5 minute interval is high (>=90%). For larger node types with four vCPUs or more, use the EngineCPUUtilization metric to monitor and set thresholds for scaling.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/elb metric=Latency Statistic=Average account=* region=* loadbalancername=* | eval(_value*1000) | sum by account, region, namespace, loadbalancername" + "query": "Namespace=aws/elasticache metric=EngineCPUUtilization statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by CacheClusterId, CacheNodeId, account, region, namespace" } ], "triggers": [ { "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 3000, + "threshold": 90, "thresholdType": "GreaterThanOrEqual", - "occurrenceType": "Always" + "occurrenceType": "Always", + "minDataPoints": 2 }, { "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 3000, + "threshold": 90, "thresholdType": "LessThan", - "occurrenceType": "Always" + "occurrenceType": "Always", + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS EC2 - High Disk Utilization", - "description": "This alert fires when the average disk utilization within a 5 minute time interval for an EC2 instance is high (>=85%).", + "name": "AWS EC2 - High Memory Utilization", + "description": "This alert fires when the average memory utilization within a 5 minute interval for an EC2 instance is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "0m", @@ -983,131 +1422,146 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/ec2 metric=Disk_UsedPercent account=* region=* instanceid=* | avg by account, region, namespace, instanceid, devname" + "query": "Namespace=aws/ec2 metric=Mem_UsedPercent account=* region=* instanceid=* | avg by account, region, namespace, instanceid" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", "threshold": 85, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", 
"triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", "threshold": 85, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon ECS - High CPU Utilization", - "description": "This alert fires when the average CPU utilization within a 5 minute interval for a service within a cluster is high (>=85%).", + "name": "Amazon ECS - High Memory Utilization", + "description": "This alert fires when the average memory utilization within a 5 minute interval for a service within a cluster is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/ecs metric=CPUUtilization statistic=Average account=* region=* ClusterName=* ServiceName=* | avg by ClusterName, ServiceName, account, region, namespace" + "query": "Namespace=aws/ecs metric=MemoryUtilization statistic=Average account=* region=* ClusterName=* ServiceName=* | avg by ClusterName, ServiceName, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", "threshold": 85, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", "threshold": 85, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Network Load Balancer - High TLS Negotiation Errors", - "description": "This alert fires when we detect that there are too many TLS Negotiation Errors (>=10%) within an interval of 5 minutes for a given network load balancer", + "name": "AWS Lambda - High Percentage of Failed Requests", + "description": "This alert fires when we detect a large number of failed Lambda requests (>5%) within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/NetworkELB metric=ClientTLSNegotiationErrorCount Statistic=sum account=* region=* LoadBalancer=* | sum by LoadBalancer, account, region, namespace" + "query": "Namespace=aws/lambda metric=Errors Statistic=Sum account=* region=* functionname=* | sum by functionname, account, region, namespace" }, { "rowId": "B", - "query": "Namespace=aws/NetworkELB metric=TargetTLSNegotiationErrorCount Statistic=sum account=* region=* LoadBalancer=* | sum by LoadBalancer, account, region, namespace" + "query": 
"Namespace=aws/lambda metric=Invocations Statistic=Sum account=* region=* functionname=* | sum by functionname, account, region, namespace" }, { "rowId": "C", - "query": "(#A + #B) along LoadBalancer, account, region, namespace" + "query": "#A * 100 / #B along functionname, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, + "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, + "threshold": 5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - Multiple Failed Operations", - "description": "This alert fires when we detect multiple failed operations within a 15 minute interval for an ElastiCache service.", + "name": "AWS DynamoDB - Multiple Tables deleted", + "description": "This alert fires when five or more tables are deleted within 15 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Logs", "evaluationDelay": "0m", @@ -1117,41 +1571,44 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/elasticache \"\\\"eventSource\\\":\\\"elasticache.amazonaws.com\\\"\" errorCode errorMessage\n| json \"eventSource\", \"errorCode\", \"errorMessage\", \"userIdentity\", \"requestParameters\", \"responseElements\" as event_source, error_code, error_message, user_identity, requestParameters, responseElements nodrop\n| json field=requestParameters \"cacheClusterId\" as req_cacheClusterId nodrop\n| json field=responseElements \"cacheClusterId\" as res_cacheClusterId nodrop\n| json field=user_identity \"arn\", \"userName\" nodrop \n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountId, user nodrop\n| if (isEmpty(userName), user, userName) as user\n| if (isEmpty(req_cacheClusterId), res_cacheClusterId, req_cacheClusterId) as cacheclusterid\n| where event_source matches \"elasticache.amazonaws.com\" and !isEmpty(error_code) and !isEmpty(error_message) and !isEmpty(user)\n| count as event_count by _messageTime, account, region, event_source, error_code, error_message, user, cacheclusterid\n| formatDate(_messageTime, \"MM/dd/yyyy HH:mm:ss:SSS Z\") as message_date\n| fields message_date, account, region, event_source, error_code, error_message, user, cacheclusterid\n| fields -_messageTime" + "query": "account=* region=* namespace=aws/dynamodb eventSource \"dynamodb.amazonaws.com\"\n| json \"eventSource\", \"eventName\", \"requestParameters.tableName\", \"sourceIPAddress\", \"userIdentity.userName\", \"userIdentity.sessionContext.sessionIssuer.userName\" as event_source, event_name, tablename, SourceIp, UserName, ContextUserName nodrop\n| where event_source = \"dynamodb.amazonaws.com\" and event_name = \"DeleteTable\"\n| if (isEmpty(UserName), ContextUserName, UserName) as user\n| count by 
_messageTime, account, region, namespace, event_name, user, tablename\n| formatDate(_messageTime, \"MM/dd/yyyy HH:mm:ss:SSS Z\") as message_date\n| fields message_date, account, region, namespace, event_name, user, tablename\n| fields -_messageTime" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-15m", - "threshold": 10, + "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults" + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-15m", - "threshold": 10, + "threshold": 5, "thresholdType": "LessThan", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults" + "field": null } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, - "playbook": "" + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - High Redis Database Memory Usage", - "description": "This alert fires when the average database memory usage within a 5 minute interval for the Redis engine is high (>=95%). When the value reaches 100%, eviction may happen or write operations may fail based on ElastiCache policies thereby impacting application performance.", + "name": "Amazon Elasticache - Multiple Failed Operations", + "description": "This alert fires when we detect multiple failed operations within a 15 minute interval for an ElastiCache service.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", + "monitorType": "Logs", "evaluationDelay": "0m", "alertName": null, "runAs": null, @@ -1159,83 +1616,91 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=DatabaseMemoryUsagePercentage statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" + "query": "account=* region=* namespace=aws/elasticache \"\\\"eventSource\\\":\\\"elasticache.amazonaws.com\\\"\" errorCode errorMessage\n| json \"eventSource\", \"errorCode\", \"errorMessage\", \"userIdentity\", \"requestParameters\", \"responseElements\" as event_source, error_code, error_message, user_identity, requestParameters, responseElements nodrop\n| json field=requestParameters \"cacheClusterId\" as req_cacheClusterId nodrop\n| json field=responseElements \"cacheClusterId\" as res_cacheClusterId nodrop\n| json field=user_identity \"arn\", \"userName\" nodrop \n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountId, user nodrop\n| if (isEmpty(userName), user, userName) as user\n| if (isEmpty(req_cacheClusterId), res_cacheClusterId, req_cacheClusterId) as cacheclusterid\n| where event_source matches \"elasticache.amazonaws.com\" and !isEmpty(error_code) and !isEmpty(error_message) and !isEmpty(user)\n| count as event_count by _messageTime, account, region, event_source, error_code, error_message, user, cacheclusterid\n| formatDate(_messageTime, \"MM/dd/yyyy HH:mm:ss:SSS Z\") as message_date\n| fields message_date, account, region, event_source, error_code, error_message, user, cacheclusterid\n| fields -_messageTime" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", 
"triggerType": "Critical", - "timeRange": "-5m", - "threshold": 95, + "resolutionWindow": null, + "timeRange": "-15m", + "threshold": 10, "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", - "timeRange": "-5m", - "threshold": 95, + "resolutionWindow": null, + "timeRange": "-15m", + "threshold": 10, "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "field": null } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS EC2 - High System CPU Utilization", - "description": "This alert fires when the average system CPU utilization within a 5 minute interval for an EC2 instance is high (>=85%).", + "name": "Amazon RDS - High CPU Utilization", + "description": "This alert fires when we detect that the average CPU utilization for a database is high (>=85%) for an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/ec2 metric=CPU_Sys account=* region=* instanceid=* | avg by account, region, namespace, instanceid" + "query": "Namespace=aws/rds metric=CPUUtilization statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", "threshold": 85, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", "threshold": 85, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Lambda - Low Provisioned Concurrency Utilization", - "description": "This alert fires when the average provisioned concurrency utilization for 5 minutes is low (<= 50%). 
This indicates low provisioned concurrency utilization efficiency.", + "name": "AWS Classic Load Balancer - Access from Highly Malicious Sources", + "description": "This alert fires when the Classic load balancer is accessed from highly malicious IP addresses within last 5 minutes.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", + "monitorType": "Logs", "evaluationDelay": "0m", "alertName": null, "runAs": null, @@ -1243,41 +1708,44 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/lambda metric=ProvisionedConcurrencyUtilization statistic=Average account=* region=* functionname=* | avg by functionname, namespace, region, account" + "query": "account=* region=* namespace=aws/elb\n| parse \"* * * * * * * * * * * \\\"*\\\" \\\"*\\\" * *\" as datetime, loadbalancername, client, backend, request_processing_time, backend_processing_time, response_processing_time, elb_status_code, backend_status_code, received_bytes, sent_bytes, request, user_agent, ssl_cipher, ssl_protocol\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancername, account, region, namespace\n| threatlookup singleIndicator ClientIp\n| where (_threatlookup.type=\"ipv4-addr:value\" or _threatlookup.type=\"ipv6-addr:value\") and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as MaliciousConfidence\n| where MaliciousConfidence=\"high\"\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancername, account, region, namespace, MaliciousConfidence, Actor" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "LessThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "threshold": 0, + "thresholdType": "GreaterThan", + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "GreaterThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "threshold": 0, + "thresholdType": "LessThanOrEqual", + "field": null } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - High Max Provisioned Table Write Capacity", - "description": "This alert fires when we detect that the average percentage of write provisioned capacity used by the highest write provisioned table of an account for a time interval of 5 minutes is great than or equal to 80%. 
High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", + "name": "AWS SNS - Access from Highly Malicious Sources", + "description": "This alert fires when an Application AWS - SNS is accessed from highly malicious IP addresses within last 5 minutes", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", + "monitorType": "Logs", "evaluationDelay": "0m", "alertName": null, "runAs": null, @@ -1285,83 +1753,99 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/dynamodb metric=MaxProvisionedTableWriteCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" + "query": "account=* region=* namespace=aws/sns \"\\\"eventsource\\\":\\\"sns.amazonaws.com\\\"\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"sourceIPAddress\", \"userAgent\", \"eventType\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"requestID\", \"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, src_ip, user_agent, event_type, recipient_account_id, requestParameters, responseElements, request_id, error_code, error_message nodrop\n| where event_source = \"sns.amazonaws.com\" \n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, user_type, arn, username nodrop\n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountid, user nodrop\n| json field=requestParameters \"topicArn\", \"name\", \"resourceArn\", \"subscriptionArn\" as req_topic_arn, req_topic_name, resource_arn, subscription_arn nodrop \n| json field=responseElements \"topicArn\" as res_topic_arn nodrop\n| if (isBlank(req_topic_arn), res_topic_arn, req_topic_arn) as topic_arn\n| if (isBlank(topic_arn), resource_arn, topic_arn) as topic_arn\n| parse field=topic_arn \"arn:aws:sns:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp nodrop\n| parse field=subscription_arn \"arn:aws:sns:*:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp, arn_value_temp nodrop\n| if (isBlank(req_topic_name), topic_arn_name_temp, req_topic_name) as topicname\n| if (isBlank(accountid), recipient_account_id, accountid) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status\n| if (isEmpty(username), user_type, username) as user_type\n| count as ip_count by src_ip, event_name, region, accountid,user_type\n| threatlookup singleIndicator src_ip\n| where (_threatlookup.type=\"ipv4-addr:value\" or _threatlookup.type=\"ipv6-addr:value\") and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as malicious_confidence\n| where malicious_confidence=\"high\"\n| sum(ip_count) as threat_count by src_ip, event_name, region, accountid, malicious_confidence, actor" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, - "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "threshold": 0, + "thresholdType": "GreaterThan", + "field": null }, { - 
"detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, - "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "threshold": 0, + "thresholdType": "LessThanOrEqual", + "field": null } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - High Max Provisioned Table Read Capacity", - "description": "This alert fires when we detect that the average percentage of read provisioned capacity used by the highest read provisioned table of an account for a time interval of 5 minutes is great than or equal to 80%. High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", + "name": "AWS Application Load Balancer - High 4XX Errors", + "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/dynamodb metric=MaxProvisionedTableReadCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" + "query": "Namespace=aws/applicationelb metric=HTTPCode_ELB_4XX_Count Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/applicationelb metric=RequestCount Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" + }, + { + "rowId": "C", + "query": "#A * 100 / #B along loadbalancer, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - Multiple Tables deleted", - "description": "This alert fires when we detect multiple failed operations for Elasticache service within 15 minutes", + "name": "AWS EC2 - High System CPU Utilization", + "description": "This alert fires when the average system CPU utilization within a 5 minute interval for an EC2 instance is high (>=85%).", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", + "monitorType": "Metrics", "evaluationDelay": "0m", 
"alertName": null, "runAs": null, @@ -1369,255 +1853,289 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/dynamodb eventSource \"dynamodb.amazonaws.com\"\n| json \"eventSource\", \"eventName\", \"requestParameters.tableName\", \"sourceIPAddress\", \"userIdentity.userName\", \"userIdentity.sessionContext.sessionIssuer.userName\" as event_source, event_name, tablename, SourceIp, UserName, ContextUserName nodrop\n| where event_source = \"dynamodb.amazonaws.com\" and event_name = \"DeleteTable\"\n| if (isEmpty(UserName), ContextUserName, UserName) as user\n| count by _messageTime, account, region, namespace, event_name, user, tablename\n| formatDate(_messageTime, \"MM/dd/yyyy HH:mm:ss:SSS Z\") as message_date\n| fields message_date, account, region, namespace, event_name, user, tablename\n| fields -_messageTime" + "query": "Namespace=aws/ec2 metric=CPU_Sys account=* region=* instanceid=* | avg by account, region, namespace, instanceid" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", - "timeRange": "-15m", - "threshold": 5, + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 85, "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults" + "occurrenceType": "Always", + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", - "timeRange": "-15m", - "threshold": 5, + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 85, "thresholdType": "LessThan", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults" + "occurrenceType": "Always", + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, - "playbook": "" + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - High Account Provisioned Write Capacity", - "description": "This alert fires when we detect that the average write capacity provisioned for an account for a time interval of 5 minutes is greater than or equal to 80%. 
High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", + "name": "AWS API Gateway - High Latency", + "description": "This alert fires when we detect that the average latency for a given API Gateway is greater than or equal to one second for 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/dynamodb metric=AccountProvisionedWriteCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" + "query": "Namespace=aws/apigateway metric=Latency statistic=Average account=* region=* apiname=* | avg by apiname, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 1000, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 1000, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - High Read Latency", - "description": "This alert fires when the average read latency of a database within a 5 minutes time inerval is high (>=5 seconds). High read latency will affect the performance of your application.", + "name": "Amazon Elasticache - High CPU Utilization", + "description": "This alert fires when the average CPU utilization within a 5 minute interval for a host is high (>=90%). The CPUUtilization metric includes total CPU utilization across application, operating system and management processes. 
We highly recommend monitoring CPU utilization for hosts with two vCPUs or less.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=ReadLatency statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "Namespace=aws/elasticache metric=CPUUtilization statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by CacheClusterId, CacheNodeId, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 90, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 90, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Application Load Balancer - Access from Highly Malicious Sources", - "description": "This alert fires when an Application load balancer is accessed from highly malicious IP addresses within last 5 minutes", + "name": "AWS API Gateway - High 5XX Errors", + "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/applicationelb\n| parse \"* * * * * * * * * * * * \\\"*\\\" \\\"*\\\" * * * \\\"*\\\"\" as Type, DateTime, loadbalancer, Client, Target, RequestProcessingTime, TargetProcessingTime, ResponseProcessingTime, ElbStatusCode, TargetStatusCode, ReceivedBytes, SentBytes, Request, UserAgent, SslCipher, SslProtocol, TargetGroupArn, TraceId\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancer, account, region, namespace\n| lookup type, actor, raw, threatlevel as MaliciousConfidence from sumo://threat/cs on threat=ClientIp \n| json field=raw \"labels[*].name\" as LabelName \n| replace(LabelName, \"\\\\/\",\"->\") as LabelName\n| replace(LabelName, \"\\\"\",\" \") as LabelName\n| where type=\"ip_address\" and MaliciousConfidence=\"high\"\n| if (isEmpty(actor), \"Unassigned\", actor) as Actor\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancer, account, region, namespace, MaliciousConfidence, Actor, LabelName" + "query": "Namespace=aws/apigateway metric=5xxError Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/apigateway 
metric=count Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" + }, + { + "rowId": "C", + "query": "#A * 100 / #B along account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "GreaterThan", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults" + "threshold": 5, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "LessThanOrEqual", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults" + "threshold": 5, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, - "playbook": "" + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Classic Load Balancer - High 4XX Errors", - "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", + "name": "Amazon Elasticache - Low Redis Cache Hit Rate", + "description": "This alert fires when the average cache hit rate for Redis within a 5 minute interval is low (<= 80%). This indicates low efficiency of the Redis instance. If cache ratio is lower than 80%, that indicates a significant amount of keys are either evicted, expired, or don't exist.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/elb metric=HTTPCode_ELB_4XX Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/elb metric=RequestCount Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" - }, - { - "rowId": "C", - "query": "#A * 100 / #B along loadbalancername, account, region, namespace" + "query": "Namespace=aws/elasticache metric=CacheHitRate statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" } ], "triggers": [ { "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "GreaterThanOrEqual", - "occurrenceType": "Always" + "threshold": 80, + "thresholdType": "LessThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 }, { "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "LessThan", - "occurrenceType": "Always" + "threshold": 80, + "thresholdType": "GreaterThan", + "occurrenceType": "Always", + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + 
"automatedPlaybookIds": [] }, { - "name": "Amazon RDS - High CPU Utilization", - "description": "This alert fires when we detect that the average CPU utilization for a database is high (>=85%) for an interval of 5 minutes.", + "name": "AWS SQS - Message processing not fast enough", + "description": "This alert fires when we detect message processing is not fast enough. That is, the average approximate age of the oldest non-deleted message in the queue is more than 5 seconds for an interval of 5 minutes", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=CPUUtilization statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "metric=ApproximateAgeOfOldestMessage Statistic=avg region=* account=* queuename=* namespace=aws/sqs | avg by account,region,namespace,queuename " } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, - "thresholdType": "GreaterThanOrEqual", - "field": null, + "threshold": 5, + "thresholdType": "GreaterThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, - "thresholdType": "LessThan", - "field": null, + "threshold": 5, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - High Disk Queue Depth", - "description": "This alert fires when the average disk queue depth for a database is high (>=5) for an interval of 5 minutes. 
Higher this value, higher will be the number of outstanding I/Os (read/write requests) waiting to access the disk, which will impact the performance of your application.", + "name": "AWS Application Load Balancer - Access from Highly Malicious Sources", + "description": "This alert fires when an Application load balancer is accessed from highly malicious IP addresses within last 5 minutes", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", + "monitorType": "Logs", "evaluationDelay": "0m", "alertName": null, "runAs": null, @@ -1625,39 +2143,42 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=DiskQueueDepth statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "account=* region=* namespace=aws/applicationelb\n| parse \"* * * * * * * * * * * * \\\"*\\\" \\\"*\\\" * * * \\\"*\\\"\" as Type, DateTime, loadbalancer, Client, Target, RequestProcessingTime, TargetProcessingTime, ResponseProcessingTime, ElbStatusCode, TargetStatusCode, ReceivedBytes, SentBytes, Request, UserAgent, SslCipher, SslProtocol, TargetGroupArn, TraceId\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancer, account, region, namespace\n| threatlookup singleIndicator ClientIp\n| where (_threatlookup.type=\"ipv4-addr:value\" or _threatlookup.type=\"ipv6-addr:value\") and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as MaliciousConfidence\n| where MaliciousConfidence=\"high\"\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancer, account, region, namespace, MaliciousConfidence, Actor" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "threshold": 0, + "thresholdType": "GreaterThan", + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "threshold": 0, + "thresholdType": "LessThanOrEqual", + "field": null } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - High Account Provisioned Read Capacity", - "description": "This alert fires when we detect that the average read capacity provisioned for an account for a time interval of 5 minutes is greater than or equal to 80%. 
High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", + "name": "AWS EC2 - High Disk Utilization", + "description": "This alert fires when the average disk utilization within a 5 minute time interval for an EC2 instance is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "0m", @@ -1667,35 +2188,40 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/dynamodb metric=AccountProvisionedReadCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" + "query": "Namespace=aws/ec2 metric=Disk_UsedPercent account=* region=* instanceid=* | avg by account, region, namespace, instanceid, devname" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 85, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 85, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] } ] -} +} \ No newline at end of file diff --git a/aws-observability/json/Alerts-App.json b/aws-observability/json/Alerts-App.json index 33be1e9a..fb96a1e4 100644 --- a/aws-observability/json/Alerts-App.json +++ b/aws-observability/json/Alerts-App.json @@ -1,11 +1,11 @@ { "name": "AWS Observability", - "description": "", + "description": "This folder contains all the monitors for AWS Observability solution.", "type": "MonitorsLibraryFolderExport", "children": [ { - "name": "Amazon Elasticache - Multiple Failed Operations", - "description": "This alert fires when we detect multiple failed operations within a 15 minute interval for an ElastiCache service.", + "name": "AWS SQS - Access from Highly Malicious Sources", + "description": "This alert fires when an Application AWS - SQS is accessed from highly malicious IP addresses within last 5 minutes", "type": "MonitorsLibraryMonitorExport", "monitorType": "Logs", "evaluationDelay": "0m", @@ -15,33 +15,27 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/elasticache \"\\\"eventSource\\\":\\\"elasticache.amazonaws.com\\\"\" errorCode errorMessage\n| json \"eventSource\", \"errorCode\", \"errorMessage\", \"userIdentity\", \"requestParameters\", \"responseElements\" as event_source, error_code, error_message, user_identity, requestParameters, responseElements nodrop\n| json field=requestParameters \"cacheClusterId\" as req_cacheClusterId nodrop\n| json field=responseElements \"cacheClusterId\" as res_cacheClusterId nodrop\n| json field=user_identity \"arn\", \"userName\" nodrop \n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountId, user nodrop\n| if (isEmpty(userName), user, userName) as user\n| if (isEmpty(req_cacheClusterId), res_cacheClusterId, 
req_cacheClusterId) as cacheclusterid\n| where event_source matches \"elasticache.amazonaws.com\" and !isEmpty(error_code) and !isEmpty(error_message) and !isEmpty(user)\n| count as event_count by _messageTime, account, region, event_source, error_code, error_message, user, cacheclusterid\n| formatDate(_messageTime, \"MM/dd/yyyy HH:mm:ss:SSS Z\") as message_date\n| fields message_date, account, region, event_source, error_code, error_message, user, cacheclusterid\n| fields -_messageTime" + "query": "account=* region=* namespace=\"aws/sqs\" eventname eventsource \"sqs.amazonaws.com\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"sourceIPAddress\",\"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, recipient_account_id, requestParameters, responseElements, src_ip, error_code, error_message nodrop\n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, type, arn, username nodrop\n| json field=requestParameters \"queueUrl\" as queueUrlReq nodrop \n| json field=responseElements \"queueUrl\" as queueUrlRes nodrop\n| where event_source=\"sqs.amazonaws.com\" and !(src_ip matches \"*.amazonaws.com\")\n| if(event_name=\"CreateQueue\", queueUrlRes, queueUrlReq) as queueUrl \n| parse regex field=queueUrl \"(?[^\\/]*$)\"\n| if (isBlank(recipient_account_id), accountid, recipient_account_id) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status \n| count as ip_count by src_ip\n| threatlookup singleIndicator src_ip\n| where (_threatlookup.type=\"ipv4-addr:value\" or _threatlookup.type=\"ipv6-addr:value\") and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as malicious_confidence\n| where malicious_confidence=\"high\"\n| sort by ip_count, src_ip\n| fields src_ip, malicious_confidence, actor" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-15m", - "threshold": 10, - "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "timeRange": "-5m", + "threshold": 0, + "thresholdType": "GreaterThan", + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-15m", - "threshold": 10, - "thresholdType": "LessThan", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "timeRange": "-5m", + "threshold": 0, + "thresholdType": "LessThanOrEqual", + "field": null } ], "timeZone": null, @@ -55,50 +49,46 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - High Read Latency", - "description": "This alert fires when the average read latency of a database within a 5 minutes time inerval is high (>=5 seconds). 
High read latency will affect the performance of your application.", + "name": "AWS EC2 - High Total CPU Utilization", + "description": "This alert fires when the average total CPU utilization within a 5 minute interval for an EC2 instance is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "4m", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=ReadLatency statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "Namespace=aws/ec2 metric=CPU_Total account=* region=* instanceid=* | avg by account, region, namespace, instanceid" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 85, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 85, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -106,8 +96,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS SNS - Failed Notifications", - "description": "This alert fires where there are many failed notifications (>2) within an interval of 5 minutes.", + "name": "Amazon RDS - High Write Latency", + "description": "This alert fires when the average write latency of a database within a 5 minute interval is high (>=5 seconds) . High write latencies will affect the performance of your application.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -117,7 +107,7 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/sns TopicName=* metric=NumberOfNotificationsFailed Statistic=Sum | sum by account, region, TopicName" + "query": "Namespace=aws/rds metric=WriteLatency statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ @@ -126,8 +116,8 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 2, - "thresholdType": "GreaterThan", + "threshold": 5, + "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", "minDataPoints": 2 }, @@ -136,8 +126,8 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 2, - "thresholdType": "LessThanOrEqual", + "threshold": 5, + "thresholdType": "LessThan", "occurrenceType": "Always", "minDataPoints": 2 } @@ -153,50 +143,46 @@ "automatedPlaybookIds": [] }, { - "name": "AWS EC2 - High Total CPU Utilization", - "description": "This alert fires when the average total CPU utilization within a 5 minute interval for an EC2 instance is high (>=85%).", + "name": "Amazon RDS - Low Aurora Buffer Cache Hit Ratio", + "description": "This alert fires when the average RDS Aurora buffer cache hit ratio within a 5 minute interval is low (<= 50%). 
This indicates that a lower percentage of requests are served by the buffer cache, which could further indicate a degradation in application performance.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/ec2 metric=CPU_Total account=* region=* instanceid=* | avg by account, region, namespace, instanceid" + "query": "Namespace=aws/rds metric=BufferCacheHitRatio statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, - "thresholdType": "GreaterThanOrEqual", - "field": null, + "threshold": 50, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, - "thresholdType": "LessThan", - "field": null, + "threshold": 50, + "thresholdType": "GreaterThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -204,8 +190,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS Application Load Balancer - High Latency", - "description": "This alert fires when we detect that the average latency for a given Application load balancer within a time interval of 5 minutes is greater than or equal to three seconds.", + "name": "Amazon Elasticache - High Redis Memory Fragmentation Ratio", + "description": "This alert fires when the average Redis memory fragmentation ratio within a 5 minute interval is high (>=1.5). 
Value equal to or greater than 1.5 Indicate significant memory fragmentation.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -215,39 +201,35 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/applicationelb metric=TargetResponseTime Statistic=Average account=* region=* loadbalancer=* | eval(_value*1000) | sum by account, region, namespace, loadbalancer" + "query": "Namespace=aws/elasticache metric=MemoryFragmentationRatio statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 3000, + "threshold": 1.5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 3000, + "threshold": 1.5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -255,38 +237,40 @@ "automatedPlaybookIds": [] }, { - "name": "AWS SNS - Access from Highly Malicious Sources", - "description": "This alert fires when an Application AWS - SNS is accessed from highly malicious IP addresses within last 5 minutes", + "name": "Amazon ECS - High CPU Utilization", + "description": "This alert fires when the average CPU utilization within a 5 minute interval for a service within a cluster is high (>=85%).", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/sns \"\\\"eventsource\\\":\\\"sns.amazonaws.com\\\"\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"sourceIPAddress\", \"userAgent\", \"eventType\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"requestID\", \"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, src_ip, user_agent, event_type, recipient_account_id, requestParameters, responseElements, request_id, error_code, error_message nodrop\n| where event_source = \"sns.amazonaws.com\" \n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, user_type, arn, username nodrop\n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountid, user nodrop\n| json field=requestParameters \"topicArn\", \"name\", \"resourceArn\", \"subscriptionArn\" as req_topic_arn, req_topic_name, resource_arn, subscription_arn nodrop \n| json field=responseElements \"topicArn\" as res_topic_arn nodrop\n| if (isBlank(req_topic_arn), res_topic_arn, req_topic_arn) as topic_arn\n| if (isBlank(topic_arn), resource_arn, topic_arn) as topic_arn\n| parse field=topic_arn \"arn:aws:sns:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp nodrop\n| parse field=subscription_arn \"arn:aws:sns:*:*:*:*\" as 
region_temp, accountid_temp, topic_arn_name_temp, arn_value_temp nodrop\n| if (isBlank(req_topic_name), topic_arn_name_temp, req_topic_name) as topicname\n| if (isBlank(accountid), recipient_account_id, accountid) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status\n| if (isEmpty(username), user_type, username) as user_type\n| count as ip_count by src_ip, event_name, region, accountid,user_type\n| threatlookup singleIndicator src_ip\n| where _threatlookup.threat_type=\"ip_address\" and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as malicious_confidence\n| where malicious_confidence=\"high\"\n| sum(ip_count) as threat_count by src_ip, event_name, region, accountid, malicious_confidence, actor" + "query": "Namespace=aws/ecs metric=CPUUtilization statistic=Average account=* region=* ClusterName=* ServiceName=* | avg by ClusterName, ServiceName, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "GreaterThan", - "field": null + "threshold": 85, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 }, { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "LessThanOrEqual", - "field": null + "threshold": 85, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 } ], "timeZone": null, @@ -300,8 +284,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - High Max Provisioned Table Read Capacity", - "description": "This alert fires when we detect that the average percentage of read provisioned capacity used by the highest read provisioned table of an account for a time interval of 5 minutes is great than or equal to 80%. 
High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", + "name": "AWS SNS - Failed Notifications", + "description": "This alert fires when there are many failed notifications (>=5) within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -311,39 +295,35 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/dynamodb metric=MaxProvisionedTableReadCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" + "query": "account=* region=* namespace=aws/sns TopicName=* metric=NumberOfNotificationsFailed Statistic=Sum \n| sum by account, region, TopicName" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, - "thresholdType": "GreaterThanOrEqual", - "field": null, + "threshold": 2, + "thresholdType": "GreaterThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, - "thresholdType": "LessThan", - "field": null, + "threshold": 2, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -351,8 +331,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - High 4XX Errors", - "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", + "name": "Amazon RDS - High Read Latency", + "description": "This alert fires when the average read latency of a database within a 5 minute time interval is high (>=5 seconds). 
High read latency will affect the performance of your application.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -362,47 +342,35 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/apigateway metric=4xxError Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/apigateway metric=count Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" - }, - { - "rowId": "C", - "query": "#A * 100 / #B along apiname, account, region, namespace" + "query": "Namespace=aws/rds metric=ReadLatency statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -457,7 +425,7 @@ "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -465,8 +433,8 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - High Write Latency", - "description": "This alert fires when the average write latency of a database within a 5 minute interval is high (>=5 seconds) . High write latencies will affect the performance of your application.", + "name": "Amazon Elasticache - High Redis Database Memory Usage", + "description": "This alert fires when the average database memory usage within a 5 minute interval for the Redis engine is high (>=95%). 
When the value reaches 100%, eviction may happen or write operations may fail based on ElastiCache policies thereby impacting application performance.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -476,39 +444,35 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=WriteLatency statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "Namespace=aws/elasticache metric=DatabaseMemoryUsagePercentage statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 95, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 95, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -516,44 +480,40 @@ "automatedPlaybookIds": [] }, { - "name": "AWS Application Load Balancer - Access from Highly Malicious Sources", - "description": "This alert fires when an Application load balancer is accessed from highly malicious IP addresses within last 5 minutes", + "name": "AWS SQS - Messages not processed", + "description": "This alert fires when we detect messages that have been received by a consumer, but have not been processed (deleted/failed). 
That is, the average number of messages that are in flight are >=20 for an interval of 5 minutes", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/applicationelb\n| parse \"* * * * * * * * * * * * \\\"*\\\" \\\"*\\\" * * * \\\"*\\\"\" as Type, DateTime, loadbalancer, Client, Target, RequestProcessingTime, TargetProcessingTime, ResponseProcessingTime, ElbStatusCode, TargetStatusCode, ReceivedBytes, SentBytes, Request, UserAgent, SslCipher, SslProtocol, TargetGroupArn, TraceId\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancer, account, region, namespace\n| threatlookup singleIndicator ClientIp\n| where _threatlookup.threat_type=\"ip_address\" and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as MaliciousConfidence\n| where MaliciousConfidence=\"high\"\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancer, account, region, namespace, MaliciousConfidence, Actor" + "query": "metric=ApproximateNumberOfMessagesNotVisible Statistic=avg region = * account=* queuename=* namespace=aws/sqs | avg by account, region, namespace, queuename " } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "GreaterThan", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "threshold": 20, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "LessThanOrEqual", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "threshold": 20, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 } ], "timeZone": null, @@ -567,8 +527,8 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - High CPU Utilization", - "description": "This alert fires when the average CPU utilization within a 5 minute interval for a host is high (>=90%). The CPUUtilization metric includes total CPU utilization across application, operating system and management processes. We highly recommend monitoring CPU utilization for hosts with two vCPUs or less.", + "name": "AWS DynamoDB - High Max Provisioned Table Read Capacity", + "description": "This alert fires when we detect that the average percentage of read provisioned capacity used by the highest read provisioned table of an account for a time interval of 5 minutes is great than or equal to 80%. 
High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -578,39 +538,35 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=CPUUtilization statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by CacheClusterId, CacheNodeId, account, region, namespace" + "query": "Namespace=aws/dynamodb metric=MaxProvisionedTableReadCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 90, + "threshold": 80, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 90, + "threshold": 80, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -618,8 +574,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - High Integration Latency", - "description": "This alert fires when we detect that the average integration latency for a given API Gateway is greater than or equal to one second for 5 minutes.", + "name": "AWS DynamoDB - High Max Provisioned Table Write Capacity", + "description": "This alert fires when we detect that the average percentage of write provisioned capacity used by the highest write provisioned table of an account for a time interval of 5 minutes is greater than or equal to 80%. 
High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -629,39 +585,35 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/apigateway metric=IntegrationLatency statistic=Average account=* region=* apiname=* | avg by apiname, namespace, region, account" + "query": "Namespace=aws/dynamodb metric=MaxProvisionedTableWriteCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1000, + "threshold": 80, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1000, + "threshold": 80, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -669,8 +621,8 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - High Redis Database Memory Usage", - "description": "This alert fires when the average database memory usage within a 5 minute interval for the Redis engine is high (>=95%). When the value reaches 100%, eviction may happen or write operations may fail based on ElastiCache policies thereby impacting application performance.", + "name": "Amazon RDS - Low Burst Balance", + "description": "This alert fires when we observe a low burst balance (<= 50%) for a given database. 
A low burst balance indicates you won't be able to scale up as fast for burstable database workloads on gp2 volumes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -680,39 +632,35 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=DatabaseMemoryUsagePercentage statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" + "query": "Namespace=aws/rds metric=BurstBalance statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 95, - "thresholdType": "GreaterThanOrEqual", - "field": null, + "threshold": 50, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 95, - "thresholdType": "LessThan", - "field": null, + "threshold": 50, + "thresholdType": "GreaterThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -720,44 +668,40 @@ "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - Multiple Tables deleted", - "description": "This alert fires when five or more tables are deleted within 15 minutes.", + "name": "AWS SQS - Queue has stopped receiving messages", + "description": "This alert fires when we detect that the queue has stopped receiving messages. 
That is, the average number of messages received in the queue <1 for an interval of 30 minutes", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/dynamodb eventSource \"dynamodb.amazonaws.com\"\n| json \"eventSource\", \"eventName\", \"requestParameters.tableName\", \"sourceIPAddress\", \"userIdentity.userName\", \"userIdentity.sessionContext.sessionIssuer.userName\" as event_source, event_name, tablename, SourceIp, UserName, ContextUserName nodrop\n| where event_source = \"dynamodb.amazonaws.com\" and event_name = \"DeleteTable\"\n| if (isEmpty(UserName), ContextUserName, UserName) as user\n| count by _messageTime, account, region, namespace, event_name, user, tablename\n| formatDate(_messageTime, \"MM/dd/yyyy HH:mm:ss:SSS Z\") as message_date\n| fields message_date, account, region, namespace, event_name, user, tablename\n| fields -_messageTime" + "query": "metric=NumberOfMessagesReceived Statistic=avg region=* account=* queuename=* namespace=aws/sqs | avg by account, region, namespace, queuename " } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-15m", - "threshold": 5, - "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "timeRange": "-30m", + "threshold": 1, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-15m", - "threshold": 5, - "thresholdType": "LessThan", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "timeRange": "-30m", + "threshold": 1, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 } ], "timeZone": null, @@ -771,8 +715,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - High Max Provisioned Table Write Capacity", - "description": "This alert fires when we detect that the average percentage of write provisioned capacity used by the highest write provisioned table of an account for a time interval of 5 minutes is great than or equal to 80%. High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", + "name": "Amazon RDS - High Disk Queue Depth", + "description": "This alert fires when the average disk queue depth for a database is high (>=5) for an interval of 5 minutes. 
Higher this value, higher will be the number of outstanding I/Os (read/write requests) waiting to access the disk, which will impact the performance of your application.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -782,39 +726,35 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/dynamodb metric=MaxProvisionedTableWriteCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" + "query": "Namespace=aws/rds metric=DiskQueueDepth statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -822,8 +762,8 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - Low Aurora Buffer Cache Hit Ratio", - "description": "This alert fires when the average RDS Aurora buffer cache hit ratio within a 5 minute interval is low (<= 50%). This indicates that a lower percentage of requests were are served by the buffer cache, which could further indicate a degradation in application performance.", + "name": "AWS Application Load Balancer - High 5XX Errors", + "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -833,39 +773,43 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=BufferCacheHitRatio statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "Namespace=aws/applicationelb metric=HTTPCode_ELB_5XX_Count Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/applicationelb metric=RequestCount Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" + }, + { + "rowId": "C", + "query": "#A * 100 / #B along loadbalancer, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "LessThanOrEqual", - "field": null, + "threshold": 5, + "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "GreaterThan", - "field": null, + "threshold": 5, + "thresholdType": 
"LessThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -873,50 +817,44 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - Low Redis Cache Hit Rate", - "description": "This alert fires when the average cache hit rate for Redis within a 5 minute interval is low (<= 80%). This indicates low efficiency of the Redis instance. If cache ratio is lower than 80%, that indicates a significant amount of keys are either evicted, expired, or don't exist.", + "name": "AWS SNS - Failed Events", + "description": "This alert fires when an SNS app has high number of failed events (>5) within last 5 minutes", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=CacheHitRate statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" + "query": "account=* region=* namespace=aws/sns \"\\\"eventsource\\\":\\\"sns.amazonaws.com\\\"\" errorCode \n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"sourceIPAddress\", \"userAgent\", \"eventType\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"requestID\", \"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, src_ip, user_agent, event_type, recipient_account_id, requestParameters, responseElements, request_id, error_code, error_message nodrop \n| where event_source = \"sns.amazonaws.com\" and !isblank(error_code) \n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, type, arn, username nodrop \n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountId, user nodrop \n| json field=requestParameters \"topicArn\", \"name\", \"resourceArn\", \"subscriptionArn\" as req_topic_arn, req_topic_name, resource_arn, subscription_arn nodrop \n| json field=responseElements \"topicArn\" as res_topic_arn nodrop \n| if (isBlank(req_topic_arn), res_topic_arn, req_topic_arn) as topic_arn \n| if (isBlank(topic_arn), resource_arn, topic_arn) as topic_arn \n| parse field=topic_arn \"arn:aws:sns:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp nodrop \n| parse field=subscription_arn \"arn:aws:sns:*:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp, arn_value_temp nodrop \n| if (isBlank(req_topic_name), topic_arn_name_temp, req_topic_name) as topicname \n| if (isBlank(accountid), recipient_account_id, accountid) as accountid \n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status \n| if (isEmpty(username), user, username) as user \n| count as event_count by event_name, error_code, error_message, region, src_ip, accountid, user, type, request_id, topicname, topic_arn, user_agent" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, - "thresholdType": "LessThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "threshold": 5, + "thresholdType": 
"GreaterThan", + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, - "thresholdType": "GreaterThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "threshold": 5, + "thresholdType": "LessThanOrEqual", + "field": null } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -924,8 +862,8 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - High Redis Memory Fragmentation Ratio", - "description": "This alert fires when the average Redis memory fragmentation ratio for within a 5 minute interval is high (>=1.5). Value equal to or greater than 1.5 Indicate significant memory fragmentation.", + "name": "AWS API Gateway - High 4XX Errors", + "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -935,39 +873,43 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=MemoryFragmentationRatio statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" + "query": "Namespace=aws/apigateway metric=4xxError Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/apigateway metric=count Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" + }, + { + "rowId": "C", + "query": "#A * 100 / #B along apiname, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1.5, + "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1.5, + "threshold": 5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -975,50 +917,54 @@ "automatedPlaybookIds": [] }, { - "name": "AWS EC2 - High Memory Utilization", - "description": "This alert fires when the average memory utilization within a 5 minute interval for an EC2 instance is high (>=85%).", + "name": "AWS Network Load Balancer - High TLS Negotiation Errors", + "description": "This alert fires when we detect that there are too many TLS Negotiation Errors (>=10%) within an interval of 5 minutes for a given network load balancer", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/ec2 metric=Mem_UsedPercent account=* region=* instanceid=* | avg 
by account, region, namespace, instanceid" + "query": "Namespace=aws/NetworkELB metric=ClientTLSNegotiationErrorCount Statistic=sum account=* region=* LoadBalancer=* | sum by LoadBalancer, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/NetworkELB metric=TargetTLSNegotiationErrorCount Statistic=sum account=* region=* LoadBalancer=* | sum by LoadBalancer, account, region, namespace" + }, + { + "rowId": "C", + "query": "(#A + #B) along LoadBalancer, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 10, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 10, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -1026,8 +972,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS Application Load Balancer - High 5XX Errors", - "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", + "name": "AWS DynamoDB - High Account Provisioned Write Capacity", + "description": "This alert fires when we detect that the average write capacity provisioned for an account for a time interval of 5 minutes is greater than or equal to 80%. 
High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1037,47 +983,35 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/applicationelb metric=HTTPCode_ELB_5XX_Count Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/applicationelb metric=RequestCount Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" - }, - { - "rowId": "C", - "query": "#A * 100 / #B along loadbalancer, account, region, namespace" + "query": "Namespace=aws/dynamodb metric=AccountProvisionedWriteCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 80, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 80, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -1085,50 +1019,46 @@ "automatedPlaybookIds": [] }, { - "name": "AWS EC2 - High System CPU Utilization", - "description": "This alert fires when the average system CPU utilization within a 5 minute interval for an EC2 instance is high (>=85%).", + "name": "AWS API Gateway - High Integration Latency", + "description": "This alert fires when we detect that the average integration latency for a given API Gateway is greater than or equal to one second for 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/ec2 metric=CPU_Sys account=* region=* instanceid=* | avg by account, region, namespace, instanceid" + "query": "Namespace=aws/apigateway metric=IntegrationLatency statistic=Average account=* region=* apiname=* | avg by apiname, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 1000, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 1000, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + 
"groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -1136,38 +1066,40 @@ "automatedPlaybookIds": [] }, { - "name": "AWS SNS - Failed Events", - "description": "This alert fires when an SNS app has high number of failed events (>5) within last 5 minutes", + "name": "AWS Classic Load Balancer - High Latency", + "description": "This alert fires when we detect that the average latency for a given Classic load balancer within a time interval of 5 minutes is greater than or equal to three seconds.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/sns \"\\\"eventsource\\\":\\\"sns.amazonaws.com\\\"\" errorCode\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"sourceIPAddress\", \"userAgent\", \"eventType\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"requestID\", \"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, src_ip, user_agent, event_type, recipient_account_id, requestParameters, responseElements, request_id, error_code, error_message nodrop\n| where event_source = \"sns.amazonaws.com\" and !isblank(error_code)\n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, type, arn, username nodrop\n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountId, user nodrop\n| json field=requestParameters \"topicArn\", \"name\", \"resourceArn\", \"subscriptionArn\" as req_topic_arn, req_topic_name, resource_arn, subscription_arn nodrop \n| json field=responseElements \"topicArn\" as res_topic_arn nodrop\n| if (isBlank(req_topic_arn), res_topic_arn, req_topic_arn) as topic_arn\n| if (isBlank(topic_arn), resource_arn, topic_arn) as topic_arn\n| parse field=topic_arn \"arn:aws:sns:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp nodrop\n| parse field=subscription_arn \"arn:aws:sns:*:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp, arn_value_temp nodrop\n| if (isBlank(req_topic_name), topic_arn_name_temp, req_topic_name) as topicname\n| if (isBlank(accountid), recipient_account_id, accountid) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status\n| if (isEmpty(username), user, username) as user\n| count as event_count by event_name, error_code, error_message, region, src_ip, accountid, user, type, request_id, topicname, topic_arn, user_agent\n" + "query": "Namespace=aws/elb metric=Latency Statistic=Average account=* region=* loadbalancername=* | eval(_value*1000) | sum by account, region, namespace, loadbalancername" } ], "triggers": [ { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "GreaterThan", - "field": null + "threshold": 3000, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 }, { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "LessThanOrEqual", - "field": null + "threshold": 3000, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 } 
], "timeZone": null, @@ -1181,8 +1113,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS Network Load Balancer - High Unhealthy Hosts", - "description": "This alert fires when we detect that are there are too many unhealthy hosts (>=10%) within an interval of 5 minutes for a given network load balancer", + "name": "AWS Application Load Balancer - High Latency", + "description": "This alert fires when we detect that the average latency for a given Application load balancer within a time interval of 5 minutes is greater than or equal to three seconds.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1192,47 +1124,35 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/NetworkELB metric=UnHealthyHostCount Statistic=sum account=* region=* LoadBalancer=* AvailabilityZone=* | sum by LoadBalancer, AvailabilityZone, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/NetworkELB metric=HealthyHostCount Statistic=sum account=* region=* LoadBalancer=* AvailabilityZone=* | sum by LoadBalancer, AvailabilityZone, account, region, namespace" - }, - { - "rowId": "C", - "query": "#A * 100 / (#A + #B) along LoadBalancer, AvailabilityZone, account, region, namespace" + "query": "Namespace=aws/applicationelb metric=TargetResponseTime Statistic=Average account=* region=* loadbalancer=* | eval(_value*1000) | sum by account, region, namespace, loadbalancer" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, + "threshold": 3000, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, + "threshold": 3000, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -1256,34 +1176,30 @@ ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 80, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 80, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -1291,8 +1207,8 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - High CPU Utilization", - "description": "This alert fires when we detect that the average CPU utilization for a database is high (>=85%) for an interval of 5 minutes.", + "name": "AWS Lambda - Low Provisioned Concurrency Utilization", + "description": "This alert fires when the average 
provisioned concurrency utilization for 5 minutes is low (<= 50%). This indicates low provisioned concurrency utilization efficiency.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1302,39 +1218,35 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=CPUUtilization statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "Namespace=aws/lambda metric=ProvisionedConcurrencyUtilization statistic=Average account=* region=* functionname=* | avg by functionname, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, - "thresholdType": "GreaterThanOrEqual", - "field": null, + "threshold": 50, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, - "thresholdType": "LessThan", - "field": null, + "threshold": 50, + "thresholdType": "GreaterThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -1342,8 +1254,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS Application Load Balancer - High 4XX Errors", - "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", + "name": "AWS Network Load Balancer - High Unhealthy Hosts", + "description": "This alert fires when we detect that are there are too many unhealthy hosts (>=10%) within an interval of 5 minutes for a given network load balancer", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1353,47 +1265,43 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/applicationelb metric=HTTPCode_ELB_4XX_Count Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" + "query": "Namespace=aws/NetworkELB metric=UnHealthyHostCount Statistic=sum account=* region=* LoadBalancer=* AvailabilityZone=* | sum by LoadBalancer, AvailabilityZone, account, region, namespace" }, { "rowId": "B", - "query": "Namespace=aws/applicationelb metric=RequestCount Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" + "query": "Namespace=aws/NetworkELB metric=HealthyHostCount Statistic=sum account=* region=* LoadBalancer=* AvailabilityZone=* | sum by LoadBalancer, AvailabilityZone, account, region, namespace" }, { "rowId": "C", - "query": "#A * 100 / #B along loadbalancer, account, region, namespace" + "query": "#A * 100 / (#A + #B) along LoadBalancer, AvailabilityZone, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 10, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + 
"detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 10, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -1401,7 +1309,7 @@ "automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - High 5XX Errors", + "name": "AWS Classic Load Balancer - High 5XX Errors", "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", @@ -1412,47 +1320,43 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/apigateway metric=5xxError Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" + "query": "Namespace=aws/elb metric=HTTPCode_ELB_5XX Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" }, { "rowId": "B", - "query": "Namespace=aws/apigateway metric=count Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" + "query": "Namespace=aws/elb metric=RequestCount Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" }, { "rowId": "C", - "query": "#A * 100 / #B along account, region, namespace" + "query": "#A * 100 / #B along loadbalancername, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -1460,8 +1364,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS Classic Load Balancer - High Latency", - "description": "This alert fires when we detect that the average latency for a given Classic load balancer within a time interval of 5 minutes is greater than or equal to three seconds.", + "name": "Amazon Elasticache - High Engine CPU Utilization", + "description": "This alert fires when the average CPU utilization for the Redis engine process within a 5 minute interval is high (>=90%). 
For larger node types with four vCPUs or more, use the EngineCPUUtilization metric to monitor and set thresholds for scaling.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1471,7 +1375,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elb metric=Latency Statistic=Average account=* region=* loadbalancername=* | eval(_value*1000) | sum by account, region, namespace, loadbalancername" + "query": "Namespace=aws/elasticache metric=EngineCPUUtilization statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by CacheClusterId, CacheNodeId, account, region, namespace" } ], "triggers": [ @@ -1480,7 +1384,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 3000, + "threshold": 90, "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", "minDataPoints": 2 @@ -1490,7 +1394,7 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 3000, + "threshold": 90, "thresholdType": "LessThan", "occurrenceType": "Always", "minDataPoints": 2 @@ -1499,7 +1403,7 @@ "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -1507,50 +1411,46 @@ "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - High Account Provisioned Write Capacity", - "description": "This alert fires when we detect that the average write capacity provisioned for an account for a time interval of 5 minutes is greater than or equal to 80%. High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", + "name": "AWS EC2 - High Memory Utilization", + "description": "This alert fires when the average memory utilization within a 5 minute interval for an EC2 instance is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "4m", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/dynamodb metric=AccountProvisionedWriteCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" + "query": "Namespace=aws/ec2 metric=Mem_UsedPercent account=* region=* instanceid=* | avg by account, region, namespace, instanceid" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 85, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 85, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -1558,8 +1458,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS Lambda - Low Provisioned Concurrency Utilization", - "description": "This alert fires when the average provisioned concurrency utilization for 5 minutes is 
low (<= 50%). This indicates low provisioned concurrency utilization efficiency.", + "name": "Amazon ECS - High Memory Utilization", + "description": "This alert fires when the average memory utilization within a 5 minute interval for a service within a cluster is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1569,39 +1469,35 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/lambda metric=ProvisionedConcurrencyUtilization statistic=Average account=* region=* functionname=* | avg by functionname, namespace, region, account" + "query": "Namespace=aws/ecs metric=MemoryUtilization statistic=Average account=* region=* ClusterName=* ServiceName=* | avg by ClusterName, ServiceName, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "LessThanOrEqual", - "field": null, + "threshold": 85, + "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "GreaterThan", - "field": null, + "threshold": 85, + "thresholdType": "LessThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -1609,8 +1505,8 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon ECS - High Memory Utilization", - "description": "This alert fires when the average memory utilization within a 5 minute interval for a service within a cluster is high (>=85%).", + "name": "AWS Lambda - High Percentage of Failed Requests", + "description": "This alert fires when we detect a large number of failed Lambda requests (>5%) within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1620,39 +1516,43 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/ecs metric=MemoryUtilization statistic=Average account=* region=* ClusterName=* ServiceName=* | avg by ClusterName, ServiceName, account, region, namespace" + "query": "Namespace=aws/lambda metric=Errors Statistic=Sum account=* region=* functionname=* | sum by functionname, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/lambda metric=Invocations Statistic=Sum account=* region=* functionname=* | sum by functionname, account, region, namespace" + }, + { + "rowId": "C", + "query": "#A * 100 / #B along functionname, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 5, "thresholdType": "LessThan", - "field": null, 
"occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -1660,8 +1560,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS SQS - Access from highly malicious sources", - "description": "This alert fires when an AWS - SQS resource is accessed from highly malicious IP addresses within last 5 minutes", + "name": "AWS DynamoDB - Multiple Tables deleted", + "description": "This alert fires when five or more tables are deleted within 15 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Logs", "evaluationDelay": "0m", @@ -1671,7 +1571,7 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=\"aws/sqs\" eventname eventsource \"sqs.amazonaws.com\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"sourceIPAddress\",\"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, recipient_account_id, requestParameters, responseElements, src_ip, error_code, error_message nodrop\n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, type, arn, username nodrop\n| json field=requestParameters \"queueUrl\" as queueUrlReq nodrop \n| json field=responseElements \"queueUrl\" as queueUrlRes nodrop\n| where event_source=\"sqs.amazonaws.com\" and !(src_ip matches \"*.amazonaws.com\")\n| if(event_name=\"CreateQueue\", queueUrlRes, queueUrlReq) as queueUrl \n| parse regex field=queueUrl \"(?[^\\/]*$)\"\n| if (isBlank(recipient_account_id), accountid, recipient_account_id) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status \n| count as ip_count by src_ip\n| threatlookup singleIndicator src_ip\n| where _threatlookup.threat_type=\"ip_address\" and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as malicious_confidence\n| where malicious_confidence=\"high\"\n| sort by ip_count, src_ip\n| fields src_ip, malicious_confidence, actor" + "query": "account=* region=* namespace=aws/dynamodb eventSource \"dynamodb.amazonaws.com\"\n| json \"eventSource\", \"eventName\", \"requestParameters.tableName\", \"sourceIPAddress\", \"userIdentity.userName\", \"userIdentity.sessionContext.sessionIssuer.userName\" as event_source, event_name, tablename, SourceIp, UserName, ContextUserName nodrop\n| where event_source = \"dynamodb.amazonaws.com\" and event_name = \"DeleteTable\"\n| if (isEmpty(UserName), ContextUserName, UserName) as user\n| count by _messageTime, account, region, namespace, event_name, user, tablename\n| formatDate(_messageTime, \"MM/dd/yyyy HH:mm:ss:SSS Z\") as message_date\n| fields message_date, account, region, namespace, event_name, user, tablename\n| fields -_messageTime" } ], "triggers": [ @@ -1679,18 +1579,18 @@ "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 0, - "thresholdType": "GreaterThan", + "timeRange": "-15m", + "threshold": 5, + "thresholdType": "GreaterThanOrEqual", "field": null }, { "detectionMethod": 
"LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 0, - "thresholdType": "LessThanOrEqual", + "timeRange": "-15m", + "threshold": 5, + "thresholdType": "LessThan", "field": null } ], @@ -1705,54 +1605,44 @@ "automatedPlaybookIds": [] }, { - "name": "AWS Classic Load Balancer - High 5XX Errors", - "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", + "name": "Amazon Elasticache - Multiple Failed Operations", + "description": "This alert fires when we detect multiple failed operations within a 15 minute interval for an ElastiCache service.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/elb metric=HTTPCode_ELB_5XX Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/elb metric=RequestCount Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" - }, - { - "rowId": "C", - "query": "#A * 100 / #B along loadbalancername, account, region, namespace" + "query": "account=* region=* namespace=aws/elasticache \"\\\"eventSource\\\":\\\"elasticache.amazonaws.com\\\"\" errorCode errorMessage\n| json \"eventSource\", \"errorCode\", \"errorMessage\", \"userIdentity\", \"requestParameters\", \"responseElements\" as event_source, error_code, error_message, user_identity, requestParameters, responseElements nodrop\n| json field=requestParameters \"cacheClusterId\" as req_cacheClusterId nodrop\n| json field=responseElements \"cacheClusterId\" as res_cacheClusterId nodrop\n| json field=user_identity \"arn\", \"userName\" nodrop \n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountId, user nodrop\n| if (isEmpty(userName), user, userName) as user\n| if (isEmpty(req_cacheClusterId), res_cacheClusterId, req_cacheClusterId) as cacheclusterid\n| where event_source matches \"elasticache.amazonaws.com\" and !isEmpty(error_code) and !isEmpty(error_message) and !isEmpty(user)\n| count as event_count by _messageTime, account, region, event_source, error_code, error_message, user, cacheclusterid\n| formatDate(_messageTime, \"MM/dd/yyyy HH:mm:ss:SSS Z\") as message_date\n| fields message_date, account, region, event_source, error_code, error_message, user, cacheclusterid\n| fields -_messageTime" } ], "triggers": [ { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 5, + "timeRange": "-15m", + "threshold": 10, "thresholdType": "GreaterThanOrEqual", - "occurrenceType": "Always", - "minDataPoints": 2 + "field": null }, { - "detectionMethod": "MetricsStaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 5, + "timeRange": "-15m", + "threshold": 10, "thresholdType": "LessThan", - "occurrenceType": "Always", - "minDataPoints": 2 + "field": null } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, 
"monitorTemplateId": null, @@ -1760,50 +1650,46 @@ "automatedPlaybookIds": [] }, { - "name": "AWS EC2 - High Disk Utilization", - "description": "This alert fires when the average disk utilization within a 5 minute time interval for an EC2 instance is high (>=85%).", + "name": "Amazon RDS - High CPU Utilization", + "description": "This alert fires when we detect that the average CPU utilization for a database is high (>=85%) for an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/ec2 metric=Disk_UsedPercent account=* region=* instanceid=* | avg by account, region, namespace, instanceid, devname" + "query": "Namespace=aws/rds metric=CPUUtilization statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 85, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 85, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -1811,50 +1697,44 @@ "automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - High Latency", - "description": "This alert fires when we detect that the average latency for a given API Gateway is greater than or equal to one second for 5 minutes.", + "name": "AWS Classic Load Balancer - Access from Highly Malicious Sources", + "description": "This alert fires when the Classic load balancer is accessed from highly malicious IP addresses within last 5 minutes.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/apigateway metric=Latency statistic=Average account=* region=* apiname=* | avg by apiname, namespace, region, account" + "query": "account=* region=* namespace=aws/elb\n| parse \"* * * * * * * * * * * \\\"*\\\" \\\"*\\\" * *\" as datetime, loadbalancername, client, backend, request_processing_time, backend_processing_time, response_processing_time, elb_status_code, backend_status_code, received_bytes, sent_bytes, request, user_agent, ssl_cipher, ssl_protocol\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancername, account, region, namespace\n| threatlookup singleIndicator ClientIp\n| where (_threatlookup.type=\"ipv4-addr:value\" or _threatlookup.type=\"ipv6-addr:value\") and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if 
(_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as MaliciousConfidence\n| where MaliciousConfidence=\"high\"\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancername, account, region, namespace, MaliciousConfidence, Actor" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1000, - "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "threshold": 0, + "thresholdType": "GreaterThan", + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1000, - "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "threshold": 0, + "thresholdType": "LessThanOrEqual", + "field": null } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -1862,58 +1742,44 @@ "automatedPlaybookIds": [] }, { - "name": "AWS Lambda - High Percentage of Failed Requests", - "description": "This alert fires when we detect a large number of failed Lambda requests (>5%) within an interval of 5 minutes.", + "name": "AWS SNS - Access from Highly Malicious Sources", + "description": "This alert fires when an Application AWS - SNS is accessed from highly malicious IP addresses within last 5 minutes", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/lambda metric=Errors Statistic=Sum account=* region=* functionname=* | sum by functionname, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/lambda metric=Invocations Statistic=Sum account=* region=* functionname=* | sum by functionname, account, region, namespace" - }, - { - "rowId": "C", - "query": "#A * 100 / #B along functionname, account, region, namespace" + "query": "account=* region=* namespace=aws/sns \"\\\"eventsource\\\":\\\"sns.amazonaws.com\\\"\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"sourceIPAddress\", \"userAgent\", \"eventType\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"requestID\", \"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, src_ip, user_agent, event_type, recipient_account_id, requestParameters, responseElements, request_id, error_code, error_message nodrop\n| where event_source = \"sns.amazonaws.com\" \n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, user_type, arn, username nodrop\n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountid, user nodrop\n| json field=requestParameters \"topicArn\", \"name\", \"resourceArn\", \"subscriptionArn\" as req_topic_arn, req_topic_name, resource_arn, subscription_arn nodrop \n| json field=responseElements \"topicArn\" as res_topic_arn nodrop\n| if (isBlank(req_topic_arn), res_topic_arn, 
req_topic_arn) as topic_arn\n| if (isBlank(topic_arn), resource_arn, topic_arn) as topic_arn\n| parse field=topic_arn \"arn:aws:sns:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp nodrop\n| parse field=subscription_arn \"arn:aws:sns:*:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp, arn_value_temp nodrop\n| if (isBlank(req_topic_name), topic_arn_name_temp, req_topic_name) as topicname\n| if (isBlank(accountid), recipient_account_id, accountid) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status\n| if (isEmpty(username), user_type, username) as user_type\n| count as ip_count by src_ip, event_name, region, accountid,user_type\n| threatlookup singleIndicator src_ip\n| where (_threatlookup.type=\"ipv4-addr:value\" or _threatlookup.type=\"ipv6-addr:value\") and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as malicious_confidence\n| where malicious_confidence=\"high\"\n| sum(ip_count) as threat_count by src_ip, event_name, region, accountid, malicious_confidence, actor" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "threshold": 0, + "thresholdType": "GreaterThan", + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "threshold": 0, + "thresholdType": "LessThanOrEqual", + "field": null } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -1921,8 +1787,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS SQS - Queue has stopped receiving messages", - "description": "This alert fires when we detect that the queue has stopped receiving messages. 
That is, the average number of messages received in the queue <1 for an interval of 30 minutes.", + "name": "AWS Application Load Balancer - High 4XX Errors", + "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1932,7 +1798,15 @@ "queries": [ { "rowId": "A", - "query": "metric=NumberOfMessagesReceived Statistic=avg region=* account=* queuename=* namespace=aws/sqs | avg by account, region, namespace, queuename " + "query": "Namespace=aws/applicationelb metric=HTTPCode_ELB_4XX_Count Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/applicationelb metric=RequestCount Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" + }, + { + "rowId": "C", + "query": "#A * 100 / #B along loadbalancer, account, region, namespace" } ], "triggers": [ @@ -1940,21 +1814,21 @@ "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-30m", - "threshold": 1, - "thresholdType": "LessThan", + "timeRange": "-5m", + "threshold": 5, + "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", - "minDataPoints": 3 + "minDataPoints": 2 }, { "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-30m", - "threshold": 1, - "thresholdType": "GreaterThanOrEqual", + "timeRange": "-5m", + "threshold": 5, + "thresholdType": "LessThan", "occurrenceType": "Always", - "minDataPoints": 3 + "minDataPoints": 2 } ], "timeZone": null, @@ -1968,18 +1842,18 @@ "automatedPlaybookIds": [] }, { - "name": "AWS SQS - Message processing not fast enough", - "description": "This alert fires when we detect message processing is not fast enough. 
That is, the average approximate age of the oldest non-deleted message in the queue is more than 5 seconds for an interval of 5 minutes.", + "name": "AWS EC2 - High System CPU Utilization", + "description": "This alert fires when the average system CPU utilization within a 5 minute interval for an EC2 instance is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "4m", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "metric=ApproximateAgeOfOldestMessage Statistic=avg region=* account=* queuename=* namespace=aws/sqs | avg by account,region,namespace,queuename " + "query": "Namespace=aws/ec2 metric=CPU_Sys account=* region=* instanceid=* | avg by account, region, namespace, instanceid" } ], "triggers": [ @@ -1988,20 +1862,20 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "GreaterThan", + "threshold": 85, + "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", - "minDataPoints": 3 + "minDataPoints": 2 }, { "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "LessThanOrEqual", + "threshold": 85, + "thresholdType": "LessThan", "occurrenceType": "Always", - "minDataPoints": 3 + "minDataPoints": 2 } ], "timeZone": null, @@ -2015,38 +1889,40 @@ "automatedPlaybookIds": [] }, { - "name": "AWS Classic Load Balancer - Access from Highly Malicious Sources", - "description": "This alert fires when the Classic load balancer is accessed from highly malicious IP addresses within last 5 minutes.", + "name": "AWS API Gateway - High Latency", + "description": "This alert fires when we detect that the average latency for a given API Gateway is greater than or equal to one second for 5 minutes.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/elb\n| parse \"* * * * * * * * * * * \\\"*\\\" \\\"*\\\" * *\" as datetime, loadbalancername, client, backend, request_processing_time, backend_processing_time, response_processing_time, elb_status_code, backend_status_code, received_bytes, sent_bytes, request, user_agent, ssl_cipher, ssl_protocol\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancername, account, region, namespace\n| threatlookup singleIndicator ClientIp\n| where _threatlookup.threat_type=\"ip_address\" and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as MaliciousConfidence\n| where MaliciousConfidence=\"high\"\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancername, account, region, namespace, MaliciousConfidence, Actor" + "query": "Namespace=aws/apigateway metric=Latency statistic=Average account=* region=* apiname=* | avg by apiname, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "LogsStaticCondition", + 
"detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "GreaterThan", - "field": null + "threshold": 1000, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 }, { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "LessThanOrEqual", - "field": null + "threshold": 1000, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 } ], "timeZone": null, @@ -2060,8 +1936,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS Network Load Balancer - High TLS Negotiation Errors", - "description": "This alert fires when we detect that there are too many TLS Negotiation Errors (>=10%) within an interval of 5 minutes for a given network load balancer", + "name": "Amazon Elasticache - High CPU Utilization", + "description": "This alert fires when the average CPU utilization within a 5 minute interval for a host is high (>=90%). The CPUUtilization metric includes total CPU utilization across application, operating system and management processes. We highly recommend monitoring CPU utilization for hosts with two vCPUs or less.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -2071,47 +1947,35 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/NetworkELB metric=ClientTLSNegotiationErrorCount Statistic=sum account=* region=* LoadBalancer=* | sum by LoadBalancer, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/NetworkELB metric=TargetTLSNegotiationErrorCount Statistic=sum account=* region=* LoadBalancer=* | sum by LoadBalancer, account, region, namespace" - }, - { - "rowId": "C", - "query": "(#A + #B) along LoadBalancer, account, region, namespace" + "query": "Namespace=aws/elasticache metric=CPUUtilization statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by CacheClusterId, CacheNodeId, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, + "threshold": 90, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, + "threshold": 90, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -2119,8 +1983,8 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - High Engine CPU Utilization", - "description": "This alert fires when the average CPU utilization for the Redis engine process within a 5 minute interval is high (>=90%). 
For larger node types with four vCPUs or more, use the EngineCPUUtilization metric to monitor and set thresholds for scaling.", + "name": "AWS API Gateway - High 5XX Errors", + "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -2130,39 +1994,43 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=EngineCPUUtilization statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by CacheClusterId, CacheNodeId, account, region, namespace" + "query": "Namespace=aws/apigateway metric=5xxError Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/apigateway metric=count Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" + }, + { + "rowId": "C", + "query": "#A * 100 / #B along account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 90, + "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 90, + "threshold": 5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -2170,8 +2038,8 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - Low Burst Balance", - "description": "This alert fires when we observe a low burst balance (<= 50%) for a given database. A low burst balance indicates you won't be able to scale up as fast for burstable database workloads on gp2 volumes.", + "name": "Amazon Elasticache - Low Redis Cache Hit Rate", + "description": "This alert fires when the average cache hit rate for Redis within a 5 minute interval is low (<= 80%). This indicates low efficiency of the Redis instance. 
If cache ratio is lower than 80%, that indicates a significant amount of keys are either evicted, expired, or don't exist.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -2181,39 +2049,35 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=BurstBalance statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "Namespace=aws/elasticache metric=CacheHitRate statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, + "threshold": 80, "thresholdType": "LessThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, + "threshold": 80, "thresholdType": "GreaterThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -2221,8 +2085,8 @@ "automatedPlaybookIds": [] }, { - "name": "AWS SQS - Messages not processed", - "description": "This alert fires when we detect messages that have been received by a consumer, but have not been processed (deleted/failed). That is, the average number of messages that are in flight are >=20 for an interval of 5 minutes.", + "name": "AWS SQS - Message processing not fast enough", + "description": "This alert fires when we detect message processing is not fast enough. 
That is, the average approximate age of the oldest non-deleted message in the queue is more than 5 seconds for an interval of 5 minutes", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -2232,7 +2096,7 @@ "queries": [ { "rowId": "A", - "query": "metric=ApproximateNumberOfMessagesNotVisible Statistic=avg region = * account=* queuename=* namespace=aws/sqs | avg by account, region, namespace, queuename " + "query": "metric=ApproximateAgeOfOldestMessage Statistic=avg region=* account=* queuename=* namespace=aws/sqs | avg by account,region,namespace,queuename " } ], "triggers": [ @@ -2241,20 +2105,20 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 20, - "thresholdType": "GreaterThanOrEqual", + "threshold": 5, + "thresholdType": "GreaterThan", "occurrenceType": "Always", - "minDataPoints": 3 + "minDataPoints": 2 }, { "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 20, - "thresholdType": "LessThan", + "threshold": 5, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", - "minDataPoints": 3 + "minDataPoints": 2 } ], "timeZone": null, @@ -2268,50 +2132,44 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon ECS - High CPU Utilization", - "description": "This alert fires when the average CPU utilization within a 5 minute interval for a service within a cluster is high (>=85%).", + "name": "AWS Application Load Balancer - Access from Highly Malicious Sources", + "description": "This alert fires when an Application load balancer is accessed from highly malicious IP addresses within last 5 minutes", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/ecs metric=CPUUtilization statistic=Average account=* region=* ClusterName=* ServiceName=* | avg by ClusterName, ServiceName, account, region, namespace" + "query": "account=* region=* namespace=aws/applicationelb\n| parse \"* * * * * * * * * * * * \\\"*\\\" \\\"*\\\" * * * \\\"*\\\"\" as Type, DateTime, loadbalancer, Client, Target, RequestProcessingTime, TargetProcessingTime, ResponseProcessingTime, ElbStatusCode, TargetStatusCode, ReceivedBytes, SentBytes, Request, UserAgent, SslCipher, SslProtocol, TargetGroupArn, TraceId\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancer, account, region, namespace\n| threatlookup singleIndicator ClientIp\n| where (_threatlookup.type=\"ipv4-addr:value\" or _threatlookup.type=\"ipv6-addr:value\") and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as MaliciousConfidence\n| where MaliciousConfidence=\"high\"\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancer, account, region, namespace, MaliciousConfidence, Actor" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 
85, - "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "threshold": 0, + "thresholdType": "GreaterThan", + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, - "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "threshold": 0, + "thresholdType": "LessThanOrEqual", + "field": null } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null, @@ -2319,50 +2177,46 @@ "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - High Disk Queue Depth", - "description": "This alert fires when the average disk queue depth for a database is high (>=5) for an interval of 5 minutes. Higher this value, higher will be the number of outstanding I/Os (read/write requests) waiting to access the disk, which will impact the performance of your application.", + "name": "AWS EC2 - High Disk Utilization", + "description": "This alert fires when the average disk utilization within a 5 minute time interval for an EC2 instance is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "4m", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=DiskQueueDepth statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "Namespace=aws/ec2 metric=Disk_UsedPercent account=* region=* instanceid=* | avg by account, region, namespace, instanceid, devname" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 85, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 85, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, "monitorTemplateId": null,
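
The "Access from Highly Malicious Sources" monitors above (Application Load Balancer, Classic Load Balancer, SNS, SQS) all end with the same post-lookup stage built on the threatlookup operator. A minimal sketch of that stage is reproduced below for reference only; it assumes the ClientIp/ip_count fields produced earlier in those queries, and the _threatlookup.* field names and confidence cut-offs are copied verbatim from the queries in this export (the SQS/SNS variants use src_ip and lowercase malicious_confidence, but the bucketing logic is identical):

    | threatlookup singleIndicator ClientIp
    | where (_threatlookup.type="ipv4-addr:value" or _threatlookup.type="ipv6-addr:value") and !isNull(_threatlookup.confidence)
    | if (isEmpty(_threatlookup.actors), "Unassigned", _threatlookup.actors) as actor
    | if (_threatlookup.confidence >= 85, "high", if (_threatlookup.confidence >= 50, "medium", if (_threatlookup.confidence >= 15, "low", if (_threatlookup.confidence >= 0, "unverified", "Unknown")))) as MaliciousConfidence
    | where MaliciousConfidence="high"
    | sum (ip_count) as ThreatCount by ClientIp, MaliciousConfidence, actor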