diff --git a/aws-observability-terraform/app-modules/alb/app.tf b/aws-observability-terraform/app-modules/alb/app.tf index 2ed1f7f0..cfad3a2e 100644 --- a/aws-observability-terraform/app-modules/alb/app.tf +++ b/aws-observability-terraform/app-modules/alb/app.tf @@ -27,7 +27,7 @@ module "alb_module" { monitor_is_disabled = var.monitors_disabled monitor_evaluation_delay = "0m" queries = { - A = "account=* region=* namespace=aws/applicationelb\n| parse \"* * * * * * * * * * * * \\\"*\\\" \\\"*\\\" * * * \\\"*\\\"\" as Type, DateTime, loadbalancer, Client, Target, RequestProcessingTime, TargetProcessingTime, ResponseProcessingTime, ElbStatusCode, TargetStatusCode, ReceivedBytes, SentBytes, Request, UserAgent, SslCipher, SslProtocol, TargetGroupArn, TraceId\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancer, account, region, namespace\n| lookup type, actor, raw, threatlevel as MaliciousConfidence from sumo://threat/cs on threat=ClientIp \n| json field=raw \"labels[*].name\" as LabelName \n| replace(LabelName, \"\\\\/\",\"->\") as LabelName\n| replace(LabelName, \"\\\"\",\" \") as LabelName\n| where type=\"ip_address\" and MaliciousConfidence=\"high\"\n| if (isEmpty(actor), \"Unassigned\", actor) as Actor\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancer, account, region, namespace, MaliciousConfidence, Actor, LabelName" + A = "account=* region=* namespace=aws/applicationelb\n| parse \"* * * * * * * * * * * * \\\"*\\\" \\\"*\\\" * * * \\\"*\\\"\" as Type, DateTime, loadbalancer, Client, Target, RequestProcessingTime, TargetProcessingTime, ResponseProcessingTime, ElbStatusCode, TargetStatusCode, ReceivedBytes, SentBytes, Request, UserAgent, SslCipher, SslProtocol, TargetGroupArn, TraceId\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as 
ip_count by ClientIp, loadbalancer, account, region, namespace\n| threatlookup singleIndicator ClientIp\n| where (_threatlookup.type=\"ipv4-addr:value\" or _threatlookup.type=\"ipv6-addr:value\") and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as MaliciousConfidence\n| where MaliciousConfidence=\"high\"\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancer, account, region, namespace, MaliciousConfidence, Actor" } triggers = [ { diff --git a/aws-observability-terraform/app-modules/elb/app.tf b/aws-observability-terraform/app-modules/elb/app.tf index 7816a6c7..6519be97 100644 --- a/aws-observability-terraform/app-modules/elb/app.tf +++ b/aws-observability-terraform/app-modules/elb/app.tf @@ -25,7 +25,7 @@ module "classic_elb_module" { monitor_is_disabled = var.monitors_disabled monitor_evaluation_delay = "0m" queries = { - A = "account=* region=* namespace=aws/elb\n| parse \"* * * * * * * * * * * \\\"*\\\" \\\"*\\\" * *\" as datetime, loadbalancername, client, backend, request_processing_time, backend_processing_time, response_processing_time, elb_status_code, backend_status_code, received_bytes, sent_bytes, request, user_agent, ssl_cipher, ssl_protocol\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancername, account, region, namespace\n| lookup type, actor, raw, threatlevel as MaliciousConfidence from sumo://threat/cs on threat=ClientIp \n| json field=raw \"labels[*].name\" as LabelName \n| replace(LabelName, \"\\\\/\",\"->\") as LabelName\n| replace(LabelName, \"\\\"\",\" \") as LabelName\n| where type=\"ip_address\" and MaliciousConfidence=\"high\"\n| 
if (isEmpty(actor), \"Unassigned\", actor) as Actor\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancername, account, region, namespace, MaliciousConfidence, Actor, LabelName" + A = "account=* region=* namespace=aws/elb\n| parse \"* * * * * * * * * * * \\\"*\\\" \\\"*\\\" * *\" as datetime, loadbalancername, client, backend, request_processing_time, backend_processing_time, response_processing_time, elb_status_code, backend_status_code, received_bytes, sent_bytes, request, user_agent, ssl_cipher, ssl_protocol\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancername, account, region, namespace\n| threatlookup singleIndicator ClientIp\n| where (_threatlookup.type=\"ipv4-addr:value\" or _threatlookup.type=\"ipv6-addr:value\") and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as MaliciousConfidence\n| where MaliciousConfidence=\"high\"\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancername, account, region, namespace, MaliciousConfidence, Actor" } triggers = [ { diff --git a/aws-observability-terraform/app-modules/sns/app.tf b/aws-observability-terraform/app-modules/sns/app.tf index b9703c09..c9fb38c4 100644 --- a/aws-observability-terraform/app-modules/sns/app.tf +++ b/aws-observability-terraform/app-modules/sns/app.tf @@ -97,7 +97,7 @@ module "sns_module" { monitor_is_disabled = var.monitors_disabled monitor_evaluation_delay = "0m" queries = { - A = "account=* region=* namespace=aws/sns \"\\\"eventsource\\\":\\\"sns.amazonaws.com\\\"\" sourceIPAddress \n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"sourceIPAddress\", 
\"userAgent\", \"eventType\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"requestID\", \"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, src_ip, user_agent, event_type, recipient_account_id, requestParameters, responseElements, request_id, error_code, error_message nodrop \n| where event_source = \"sns.amazonaws.com\" \n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, user_type, arn, username nodrop \n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountid, user nodrop \n| json field=requestParameters \"topicArn\", \"name\", \"resourceArn\", \"subscriptionArn\" as req_topic_arn, req_topic_name, resource_arn, subscription_arn nodrop \n| json field=responseElements \"topicArn\" as res_topic_arn nodrop \n| if (isBlank(req_topic_arn), res_topic_arn, req_topic_arn) as topic_arn \n| if (isBlank(topic_arn), resource_arn, topic_arn) as topic_arn \n| parse field=topic_arn \"arn:aws:sns:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp nodrop \n| parse field=subscription_arn \"arn:aws:sns:*:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp, arn_value_temp nodrop \n| if (isBlank(req_topic_name), topic_arn_name_temp, req_topic_name) as topicname \n| if (isBlank(accountid), recipient_account_id, accountid) as accountid \n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status \n| if (isEmpty(username), user_type, username) as user_type \n| count as ip_count by src_ip, event_name, region, accountid,user_type \n| lookup type, actor, raw, threatlevel as malicious_confidence from sumo://threat/cs on threat=src_ip \n| where type=\"ip_address\" and malicious_confidence = \"high\" \n| json field=raw \"labels[*].name\" as label_name \n| replace(label_name, \"\\\\/\",\"->\") as label_name \n| replace(label_name, \"\\\"\",\" \") as label_name \n| if (isEmpty(actor), \"Unassigned\", actor) as actor \n| 
sum(ip_count) as threat_count by src_ip, event_name, region, accountid, malicious_confidence, actor, label_name" + A = "account=* region=* namespace=aws/sns \"\\\"eventsource\\\":\\\"sns.amazonaws.com\\\"\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"sourceIPAddress\", \"userAgent\", \"eventType\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"requestID\", \"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, src_ip, user_agent, event_type, recipient_account_id, requestParameters, responseElements, request_id, error_code, error_message nodrop\n| where event_source = \"sns.amazonaws.com\" \n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, user_type, arn, username nodrop\n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountid, user nodrop\n| json field=requestParameters \"topicArn\", \"name\", \"resourceArn\", \"subscriptionArn\" as req_topic_arn, req_topic_name, resource_arn, subscription_arn nodrop \n| json field=responseElements \"topicArn\" as res_topic_arn nodrop\n| if (isBlank(req_topic_arn), res_topic_arn, req_topic_arn) as topic_arn\n| if (isBlank(topic_arn), resource_arn, topic_arn) as topic_arn\n| parse field=topic_arn \"arn:aws:sns:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp nodrop\n| parse field=subscription_arn \"arn:aws:sns:*:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp, arn_value_temp nodrop\n| if (isBlank(req_topic_name), topic_arn_name_temp, req_topic_name) as topicname\n| if (isBlank(accountid), recipient_account_id, accountid) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status\n| if (isEmpty(username), user_type, username) as user_type\n| count as ip_count by src_ip, event_name, region, accountid,user_type\n| threatlookup singleIndicator src_ip\n| where (_threatlookup.type=\"ipv4-addr:value\" or 
_threatlookup.type=\"ipv6-addr:value\") and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as malicious_confidence\n| where malicious_confidence=\"high\"\n| sum(ip_count) as threat_count by src_ip, event_name, region, accountid, malicious_confidence, actor" } triggers = [ diff --git a/aws-observability-terraform/app-modules/sqs/app.tf b/aws-observability-terraform/app-modules/sqs/app.tf index fe0072d4..dad43377 100644 --- a/aws-observability-terraform/app-modules/sqs/app.tf +++ b/aws-observability-terraform/app-modules/sqs/app.tf @@ -97,7 +97,7 @@ module "sqs_module" { monitor_is_disabled = var.monitors_disabled monitor_evaluation_delay = "0m" queries = { - A = "account=* region=* namespace=\"aws/sqs\" eventname eventsource \"sqs.amazonaws.com\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"sourceIPAddress\",\"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, recipient_account_id, requestParameters, responseElements, src_ip, error_code, error_message nodrop\n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, type, arn, username nodrop\n| json field=requestParameters \"queueUrl\" as queueUrlReq nodrop \n| json field=responseElements \"queueUrl\" as queueUrlRes nodrop\n| where event_source=\"sqs.amazonaws.com\" and !(src_ip matches \"*.amazonaws.com\")\n| if(event_name=\"CreateQueue\", queueUrlRes, queueUrlReq) as queueUrl \n| parse regex field=queueUrl \"(?[^\\/]*$)\"\n| if (isBlank(recipient_account_id), accountid, recipient_account_id) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as 
event_status \n| count as ip_count by src_ip\n| lookup type, actor, raw, threatlevel as malicious_confidence from sumo://threat/cs on threat=src_ip\n| json field=raw \"labels[*].name\" as label_name \n| replace(label_name, \"\\\\/\",\"->\") as label_name\n| replace(label_name, \"\\\"\",\" \") as label_name\n| if (isEmpty(actor), \"Unassigned\", actor) as actor\n| where type=\"ip_address\" and malicious_confidence = \"high\"\n| sort by ip_count, src_ip\n| fields src_ip, malicious_confidence, actor, label_name" + A = "account=* region=* namespace=\"aws/sqs\" eventname eventsource \"sqs.amazonaws.com\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"sourceIPAddress\",\"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, recipient_account_id, requestParameters, responseElements, src_ip, error_code, error_message nodrop\n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, type, arn, username nodrop\n| json field=requestParameters \"queueUrl\" as queueUrlReq nodrop \n| json field=responseElements \"queueUrl\" as queueUrlRes nodrop\n| where event_source=\"sqs.amazonaws.com\" and !(src_ip matches \"*.amazonaws.com\")\n| if(event_name=\"CreateQueue\", queueUrlRes, queueUrlReq) as queueUrl \n| parse regex field=queueUrl \"(?[^\\/]*$)\"\n| if (isBlank(recipient_account_id), accountid, recipient_account_id) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status \n| count as ip_count by src_ip\n| threatlookup singleIndicator src_ip\n| where (_threatlookup.type=\"ipv4-addr:value\" or _threatlookup.type=\"ipv6-addr:value\") and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, 
\"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as malicious_confidence\n| where malicious_confidence=\"high\"\n| sort by ip_count, src_ip\n| fields src_ip, malicious_confidence, actor" } triggers = [ { diff --git a/aws-observability-terraform/examples/aws-observability/json/Alerts-App.json b/aws-observability-terraform/examples/aws-observability/json/Alerts-App.json index 2ecae908..fb96a1e4 100644 --- a/aws-observability-terraform/examples/aws-observability/json/Alerts-App.json +++ b/aws-observability-terraform/examples/aws-observability/json/Alerts-App.json @@ -1,13 +1,13 @@ { "name": "AWS Observability", - "description": "", + "description": "This folder contains all the monitors for AWS Observability solution.", "type": "MonitorsLibraryFolderExport", "children": [ { - "name": "AWS API Gateway - High Latency", - "description": "This alert fires when we detect that the average latency for a given API Gateway is greater than or equal to one second for 5 minutes.", + "name": "AWS SQS - Access from Highly Malicious Sources", + "description": "This alert fires when an Application AWS - SQS is accessed from highly malicious IP addresses within last 5 minutes", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", + "monitorType": "Logs", "evaluationDelay": "0m", "alertName": null, "runAs": null, @@ -15,39 +15,42 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/apigateway metric=Latency statistic=Average account=* region=* apiname=* | avg by apiname, namespace, region, account" + "query": "account=* region=* namespace=\"aws/sqs\" eventname eventsource \"sqs.amazonaws.com\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"sourceIPAddress\",\"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, recipient_account_id, requestParameters, responseElements, src_ip, error_code, 
error_message nodrop\n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, type, arn, username nodrop\n| json field=requestParameters \"queueUrl\" as queueUrlReq nodrop \n| json field=responseElements \"queueUrl\" as queueUrlRes nodrop\n| where event_source=\"sqs.amazonaws.com\" and !(src_ip matches \"*.amazonaws.com\")\n| if(event_name=\"CreateQueue\", queueUrlRes, queueUrlReq) as queueUrl \n| parse regex field=queueUrl \"(?[^\\/]*$)\"\n| if (isBlank(recipient_account_id), accountid, recipient_account_id) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status \n| count as ip_count by src_ip\n| threatlookup singleIndicator src_ip\n| where (_threatlookup.type=\"ipv4-addr:value\" or _threatlookup.type=\"ipv6-addr:value\") and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as malicious_confidence\n| where malicious_confidence=\"high\"\n| sort by ip_count, src_ip\n| fields src_ip, malicious_confidence, actor" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1000, - "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "threshold": 0, + "thresholdType": "GreaterThan", + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1000, - "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "threshold": 0, + "thresholdType": 
"LessThanOrEqual", + "field": null } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - High Engine CPU Utilization", - "description": "This alert fires when the average CPU utilization for the Redis engine process within a 5 minute interval is high (>=90%). For larger node types with four vCPUs or more, use the EngineCPUUtilization metric to monitor and set thresholds for scaling.", + "name": "AWS EC2 - High Total CPU Utilization", + "description": "This alert fires when the average total CPU utilization within a 5 minute interval for an EC2 instance is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "0m", @@ -57,627 +60,720 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=EngineCPUUtilization statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by CacheClusterId, CacheNodeId, account, region, namespace" + "query": "Namespace=aws/ec2 metric=CPU_Total account=* region=* instanceid=* | avg by account, region, namespace, instanceid" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 90, + "threshold": 85, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 90, + "threshold": 85, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + 
"timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS EC2 - High Memory Utilization", - "description": "This alert fires when the average memory utilization within a 5 minute interval for an EC2 instance is high (>=85%).", + "name": "Amazon RDS - High Write Latency", + "description": "This alert fires when the average write latency of a database within a 5 minute interval is high (>=5 seconds) . High write latencies will affect the performance of your application.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/ec2 metric=Mem_UsedPercent account=* region=* instanceid=* | avg by account, region, namespace, instanceid" + "query": "Namespace=aws/rds metric=WriteLatency statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" 
+ "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - Low Redis Cache Hit Rate", - "description": "This alert fires when the average cache hit rate for Redis within a 5 minute interval is low (<= 80%). This indicates low efficiency of the Redis instance. If cache ratio is lower than 80%, that indicates a significant amount of keys are either evicted, expired, or don't exist.", + "name": "Amazon RDS - Low Aurora Buffer Cache Hit Ratio", + "description": "This alert fires when the average RDS Aurora buffer cache hit ratio within a 5 minute interval is low (<= 50%). This indicates that a lower percentage of requests were are served by the buffer cache, which could further indicate a degradation in application performance.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=CacheHitRate statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" + "query": "Namespace=aws/rds metric=BufferCacheHitRatio statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 50, "thresholdType": "LessThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 50, 
"thresholdType": "GreaterThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - High 5XX Errors", - "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", + "name": "Amazon Elasticache - High Redis Memory Fragmentation Ratio", + "description": "This alert fires when the average Redis memory fragmentation ratio for within a 5 minute interval is high (>=1.5). Value equal to or greater than 1.5 Indicate significant memory fragmentation.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/apigateway metric=5xxError Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/apigateway metric=count Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" - }, - { - "rowId": "C", - "query": "#A * 100 / #B along account, region, namespace" + "query": "Namespace=aws/elasticache metric=MemoryFragmentationRatio statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 1.5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - 
"triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 1.5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Application Load Balancer - High Latency", - "description": "This alert fires when we detect that the average latency for a given Application load balancer within a time interval of 5 minutes is greater than or equal to three seconds.", + "name": "Amazon ECS - High CPU Utilization", + "description": "This alert fires when the average CPU utilization within a 5 minute interval for a service within a cluster is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/applicationelb metric=TargetResponseTime Statistic=Average account=* region=* loadbalancer=* | eval(_value*1000) | sum by account, region, namespace, loadbalancer" + "query": "Namespace=aws/ecs metric=CPUUtilization statistic=Average account=* region=* ClusterName=* ServiceName=* | avg by ClusterName, ServiceName, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 3000, + "threshold": 85, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - 
"triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 3000, + "threshold": 85, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon ECS - High Memory Utilization", - "description": "This alert fires when the average memory utilization within a 5 minute interval for a service within a cluster is high (>=85%).", + "name": "AWS SNS - Failed Notifications", + "description": "This alert fires where there are many failed notifications (>=5) within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/ecs metric=MemoryUtilization statistic=Average account=* region=* ClusterName=* ServiceName=* | avg by ClusterName, ServiceName, account, region, namespace" + "query": "account=* region=* namespace=aws/sns TopicName=* metric=NumberOfNotificationsFailed Statistic=Sum \n| sum by account, region, TopicName" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, - "thresholdType": "GreaterThanOrEqual", - "field": null, + "threshold": 2, + "thresholdType": "GreaterThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": 
"StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, - "thresholdType": "LessThan", - "field": null, + "threshold": 2, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - High Write Latency", - "description": "This alert fires when the average write latency of a database within a 5 minute interval is high (>=5 seconds) . High write latencies will affect the performance of your application.", + "name": "Amazon RDS - High Read Latency", + "description": "This alert fires when the average read latency of a database within a 5 minutes time inerval is high (>=5 seconds). 
High read latency will affect the performance of your application.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=WriteLatency statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "Namespace=aws/rds metric=ReadLatency statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Application Load Balancer - High 4XX Errors", + "name": "AWS Classic Load Balancer - High 4XX Errors", "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": 
"Namespace=aws/applicationelb metric=HTTPCode_ELB_4XX_Count Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" + "query": "Namespace=aws/elb metric=HTTPCode_ELB_4XX Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" }, { "rowId": "B", - "query": "Namespace=aws/applicationelb metric=RequestCount Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" + "query": "Namespace=aws/elb metric=RequestCount Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" }, { "rowId": "C", - "query": "#A * 100 / #B along loadbalancer, account, region, namespace" + "query": "#A * 100 / #B along loadbalancername, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - Low Burst Balance", - "description": "This alert fires when we observe a low burst balance (<= 50%) for a given database. 
A low burst balance indicates you won't be able to scale up as fast for burstable database workloads on gp2 volumes.", + "name": "Amazon Elasticache - High Redis Database Memory Usage", + "description": "This alert fires when the average database memory usage within a 5 minute interval for the Redis engine is high (>=95%). When the value reaches 100%, eviction may happen or write operations may fail based on ElastiCache policies thereby impacting application performance.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=BurstBalance statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "Namespace=aws/elasticache metric=DatabaseMemoryUsagePercentage statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "LessThanOrEqual", - "field": null, + "threshold": 95, + "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "GreaterThan", - "field": null, + "threshold": 95, + "thresholdType": "LessThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + 
"sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Classic Load Balancer - Access from Highly Malicious Sources", - "description": "This alert fires when the Classic load balancer is accessed from highly malicious IP addresses within last 5 minutes.", + "name": "AWS SQS - Messages not processed", + "description": "This alert fires when we detect messages that have been received by a consumer, but have not been processed (deleted/failed). That is, the average number of messages that are in flight are >=20 for an interval of 5 minutes", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/elb\n| parse \"* * * * * * * * * * * \\\"*\\\" \\\"*\\\" * *\" as datetime, loadbalancername, client, backend, request_processing_time, backend_processing_time, response_processing_time, elb_status_code, backend_status_code, received_bytes, sent_bytes, request, user_agent, ssl_cipher, ssl_protocol\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancername, account, region, namespace\n| lookup type, actor, raw, threatlevel as MaliciousConfidence from sumo://threat/cs on threat=ClientIp \n| json field=raw \"labels[*].name\" as LabelName \n| replace(LabelName, \"\\\\/\",\"->\") as LabelName\n| replace(LabelName, \"\\\"\",\" \") as LabelName\n| where type=\"ip_address\" and MaliciousConfidence=\"high\"\n| if (isEmpty(actor), \"Unassigned\", actor) as Actor\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancername, account, region, namespace, MaliciousConfidence, Actor, LabelName" + "query": "metric=ApproximateNumberOfMessagesNotVisible Statistic=avg region = * account=* 
queuename=* namespace=aws/sqs | avg by account, region, namespace, queuename " } ], "triggers": [ { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "GreaterThan", - "field": "" + "threshold": 20, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 }, { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "LessThanOrEqual", - "field": "" + "threshold": 20, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, - "playbook": "" + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Classic Load Balancer - High 5XX Errors", - "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", + "name": "AWS DynamoDB - High Max Provisioned Table Read Capacity", + "description": "This alert fires when we detect that the average percentage of read provisioned capacity used by the highest read provisioned table of an account for a time interval of 5 minutes is greater than or equal to 80%. 
High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/elb metric=HTTPCode_ELB_5XX Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/elb metric=RequestCount Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" - }, - { - "rowId": "C", - "query": "#A * 100 / #B along loadbalancername, account, region, namespace" + "query": "Namespace=aws/dynamodb metric=MaxProvisionedTableReadCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" } ], "triggers": [ { "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 80, "thresholdType": "GreaterThanOrEqual", - "occurrenceType": "Always" + "occurrenceType": "Always", + "minDataPoints": 2 }, { "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 80, "thresholdType": "LessThan", - "occurrenceType": "Always" + "occurrenceType": "Always", + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Network Load Balancer - High Unhealthy Hosts", - "description": "This alert fires when we detect that are there are too many unhealthy hosts (>=10%) within an interval of 5 
minutes for a given network load balancer", + "name": "AWS DynamoDB - High Max Provisioned Table Write Capacity", + "description": "This alert fires when we detect that the average percentage of write provisioned capacity used by the highest write provisioned table of an account for a time interval of 5 minutes is greater than or equal to 80%. High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/NetworkELB metric=UnHealthyHostCount Statistic=sum account=* region=* LoadBalancer=* AvailabilityZone=* | sum by LoadBalancer, AvailabilityZone, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/NetworkELB metric=HealthyHostCount Statistic=sum account=* region=* LoadBalancer=* AvailabilityZone=* | sum by LoadBalancer, AvailabilityZone, account, region, namespace" - }, - { - "rowId": "C", - "query": "#A * 100 / (#A + #B) along LoadBalancer, AvailabilityZone, account, region, namespace" + "query": "Namespace=aws/dynamodb metric=MaxProvisionedTableWriteCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, + "threshold": 80, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, + "threshold": 80, 
"thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - High Integration Latency", - "description": "This alert fires when we detect that the average integration latency for a given API Gateway is greater than or equal to one second for 5 minutes.", + "name": "Amazon RDS - Low Burst Balance", + "description": "This alert fires when we observe a low burst balance (<= 50%) for a given database. A low burst balance indicates you won't be able to scale up as fast for burstable database workloads on gp2 volumes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/apigateway metric=IntegrationLatency statistic=Average account=* region=* apiname=* | avg by apiname, namespace, region, account" + "query": "Namespace=aws/rds metric=BurstBalance statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1000, - "thresholdType": "GreaterThanOrEqual", - "field": null, + "threshold": 50, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1000, - 
"thresholdType": "LessThan", - "field": null, + "threshold": 50, + "thresholdType": "GreaterThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Lambda - High Percentage of Failed Requests", - "description": "This alert fires when we detect a large number of failed Lambda requests (>5%) within an interval of 5 minutes.", + "name": "AWS SQS - Queue has stopped receiving messages", + "description": "This alert fires when we detect that the queue has stopped receiving messages. That is, the average number of messages received in the queue <1 for an interval of 30 minutes", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/lambda metric=Errors Statistic=Sum account=* region=* functionname=* | sum by functionname, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/lambda metric=Invocations Statistic=Sum account=* region=* functionname=* | sum by functionname, account, region, namespace" - }, - { - "rowId": "C", - "query": "#A * 100 / #B along functionname, account, region, namespace" + "query": "metric=NumberOfMessagesReceived Statistic=avg region=* account=* queuename=* namespace=aws/sqs | avg by account, region, namespace, queuename " } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", - "timeRange": "-5m", - "threshold": 5, - "thresholdType": "GreaterThanOrEqual", - "field": null, + "resolutionWindow": null, + "timeRange": "-30m", + "threshold": 1, + "thresholdType": 
"LessThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", - "timeRange": "-5m", - "threshold": 5, - "thresholdType": "LessThan", - "field": null, + "resolutionWindow": null, + "timeRange": "-30m", + "threshold": 1, + "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Application Load Balancer - High 5XX Errors", - "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", + "name": "Amazon RDS - High Disk Queue Depth", + "description": "This alert fires when the average disk queue depth for a database is high (>=5) for an interval of 5 minutes. 
Higher this value, higher will be the number of outstanding I/Os (read/write requests) waiting to access the disk, which will impact the performance of your application.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/applicationelb metric=HTTPCode_ELB_5XX_Count Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" + "query": "Namespace=aws/rds metric=DiskQueueDepth statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + } + ], + "triggers": [ + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "Critical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 5, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 + }, + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "ResolvedCritical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 5, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 + } + ], + "timeZone": null, + "notifications": [], + "isDisabled": true, + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] + }, + { + "name": "AWS Application Load Balancer - High 5XX Errors", + "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", + "type": "MonitorsLibraryMonitorExport", + "monitorType": "Metrics", + "evaluationDelay": "4m", + "alertName": null, + "runAs": null, + "notificationGroupFields": [], + "queries": [ + { + "rowId": "A", + "query": "Namespace=aws/applicationelb metric=HTTPCode_ELB_5XX_Count Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, 
account, region, namespace" }, { "rowId": "B", @@ -690,36 +786,41 @@ ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - High Redis Memory Fragmentation Ratio", - "description": "This alert fires when the average Redis memory fragmentation ratio for within a 5 minute interval is high (>=1.5). 
Value equal to or greater than 1.5 Indicate significant memory fragmentation.", + "name": "AWS SNS - Failed Events", + "description": "This alert fires when an SNS app has high number of failed events (>5) within last 5 minutes", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", + "monitorType": "Logs", "evaluationDelay": "0m", "alertName": null, "runAs": null, @@ -727,253 +828,591 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=MemoryFragmentationRatio statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" + "query": "account=* region=* namespace=aws/sns \"\\\"eventsource\\\":\\\"sns.amazonaws.com\\\"\" errorCode \n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"sourceIPAddress\", \"userAgent\", \"eventType\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"requestID\", \"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, src_ip, user_agent, event_type, recipient_account_id, requestParameters, responseElements, request_id, error_code, error_message nodrop \n| where event_source = \"sns.amazonaws.com\" and !isblank(error_code) \n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, type, arn, username nodrop \n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountId, user nodrop \n| json field=requestParameters \"topicArn\", \"name\", \"resourceArn\", \"subscriptionArn\" as req_topic_arn, req_topic_name, resource_arn, subscription_arn nodrop \n| json field=responseElements \"topicArn\" as res_topic_arn nodrop \n| if (isBlank(req_topic_arn), res_topic_arn, req_topic_arn) as topic_arn \n| if (isBlank(topic_arn), resource_arn, topic_arn) as topic_arn \n| parse field=topic_arn \"arn:aws:sns:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp nodrop \n| parse 
field=subscription_arn \"arn:aws:sns:*:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp, arn_value_temp nodrop \n| if (isBlank(req_topic_name), topic_arn_name_temp, req_topic_name) as topicname \n| if (isBlank(accountid), recipient_account_id, accountid) as accountid \n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status \n| if (isEmpty(username), user, username) as user \n| count as event_count by event_name, error_code, error_message, region, src_ip, accountid, user, type, request_id, topicname, topic_arn, user_agent" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1.5, + "threshold": 5, + "thresholdType": "GreaterThan", + "field": null + }, + { + "detectionMethod": "LogsStaticCondition", + "triggerType": "ResolvedCritical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 5, + "thresholdType": "LessThanOrEqual", + "field": null + } + ], + "timeZone": null, + "notifications": [], + "isDisabled": true, + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] + }, + { + "name": "AWS API Gateway - High 4XX Errors", + "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", + "type": "MonitorsLibraryMonitorExport", + "monitorType": "Metrics", + "evaluationDelay": "4m", + "alertName": null, + "runAs": null, + "notificationGroupFields": [], + "queries": [ + { + "rowId": "A", + "query": "Namespace=aws/apigateway metric=4xxError Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/apigateway metric=count Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" + }, + { + "rowId": "C", + "query": "#A * 
100 / #B along apiname, account, region, namespace" + } + ], + "triggers": [ + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "Critical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1.5, + "threshold": 5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - High CPU Utilization", - "description": "This alert fires when the average CPU utilization within a 5 minute interval for a host is high (>=90%). The CPUUtilization metric includes total CPU utilization across application, operating system and management processes. 
We highly recommend monitoring CPU utilization for hosts with two vCPUs or less.", + "name": "AWS Network Load Balancer - High TLS Negotiation Errors", + "description": "This alert fires when we detect that there are too many TLS Negotiation Errors (>=10%) within an interval of 5 minutes for a given network load balancer", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=CPUUtilization statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by CacheClusterId, CacheNodeId, account, region, namespace" + "query": "Namespace=aws/NetworkELB metric=ClientTLSNegotiationErrorCount Statistic=sum account=* region=* LoadBalancer=* | sum by LoadBalancer, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/NetworkELB metric=TargetTLSNegotiationErrorCount Statistic=sum account=* region=* LoadBalancer=* | sum by LoadBalancer, account, region, namespace" + }, + { + "rowId": "C", + "query": "(#A + #B) along LoadBalancer, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 90, + "threshold": 10, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 90, + "threshold": 10, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": 
false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - Low Aurora Buffer Cache Hit Ratio", - "description": "This alert fires when the average RDS Aurora buffer cache hit ratio within a 5 minute interval is low (<= 50%). This indicates that a lower percentage of requests were are served by the buffer cache, which could further indicate a degradation in application performance.", + "name": "AWS DynamoDB - High Account Provisioned Write Capacity", + "description": "This alert fires when we detect that the average write capacity provisioned for an account for a time interval of 5 minutes is greater than or equal to 80%. High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=BufferCacheHitRatio statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "Namespace=aws/dynamodb metric=AccountProvisionedWriteCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" + } + ], + "triggers": [ + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "Critical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 80, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 + }, + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "ResolvedCritical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 80, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 + } + ], + "timeZone": null, + 
"notifications": [], + "isDisabled": true, + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] + }, + { + "name": "AWS API Gateway - High Integration Latency", + "description": "This alert fires when we detect that the average integration latency for a given API Gateway is greater than or equal to one second for 5 minutes.", + "type": "MonitorsLibraryMonitorExport", + "monitorType": "Metrics", + "evaluationDelay": "4m", + "alertName": null, + "runAs": null, + "notificationGroupFields": [], + "queries": [ + { + "rowId": "A", + "query": "Namespace=aws/apigateway metric=IntegrationLatency statistic=Average account=* region=* apiname=* | avg by apiname, namespace, region, account" + } + ], + "triggers": [ + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "Critical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 1000, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 + }, + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "ResolvedCritical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 1000, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 + } + ], + "timeZone": null, + "notifications": [], + "isDisabled": true, + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] + }, + { + "name": "AWS Classic Load Balancer - High Latency", + "description": "This alert fires when we detect that the average latency for a given Classic load balancer within a time interval of 5 minutes is greater than or equal to three seconds.", + "type": "MonitorsLibraryMonitorExport", + "monitorType": "Metrics", + "evaluationDelay": "4m", + "alertName": null, + "runAs": null, + "notificationGroupFields": [], + "queries": [ + { + "rowId": "A", + "query": "Namespace=aws/elb 
metric=Latency Statistic=Average account=* region=* loadbalancername=* | eval(_value*1000) | sum by account, region, namespace, loadbalancername" + } + ], + "triggers": [ + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "Critical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 3000, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 + }, + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "ResolvedCritical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 3000, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 + } + ], + "timeZone": null, + "notifications": [], + "isDisabled": true, + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] + }, + { + "name": "AWS Application Load Balancer - High Latency", + "description": "This alert fires when we detect that the average latency for a given Application load balancer within a time interval of 5 minutes is greater than or equal to three seconds.", + "type": "MonitorsLibraryMonitorExport", + "monitorType": "Metrics", + "evaluationDelay": "4m", + "alertName": null, + "runAs": null, + "notificationGroupFields": [], + "queries": [ + { + "rowId": "A", + "query": "Namespace=aws/applicationelb metric=TargetResponseTime Statistic=Average account=* region=* loadbalancer=* | eval(_value*1000) | sum by account, region, namespace, loadbalancer" + } + ], + "triggers": [ + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "Critical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 3000, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 + }, + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "ResolvedCritical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 3000, + "thresholdType": "LessThan", + 
"occurrenceType": "Always", + "minDataPoints": 2 + } + ], + "timeZone": null, + "notifications": [], + "isDisabled": true, + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] + }, + { + "name": "AWS DynamoDB - High Account Provisioned Read Capacity", + "description": "This alert fires when we detect that the average read capacity provisioned for an account for a time interval of 5 minutes is greater than or equal to 80%. High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", + "type": "MonitorsLibraryMonitorExport", + "monitorType": "Metrics", + "evaluationDelay": "4m", + "alertName": null, + "runAs": null, + "notificationGroupFields": [], + "queries": [ + { + "rowId": "A", + "query": "Namespace=aws/dynamodb metric=AccountProvisionedReadCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" + } + ], + "triggers": [ + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "Critical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 80, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 + }, + { + "detectionMethod": "MetricsStaticCondition", + "triggerType": "ResolvedCritical", + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 80, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 + } + ], + "timeZone": null, + "notifications": [], + "isDisabled": true, + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] + }, + { + "name": "AWS Lambda - Low Provisioned Concurrency Utilization", + "description": "This alert fires when the average provisioned concurrency utilization for 5 minutes is low (<= 50%). 
This indicates low provisioned concurrency utilization efficiency.", + "type": "MonitorsLibraryMonitorExport", + "monitorType": "Metrics", + "evaluationDelay": "4m", + "alertName": null, + "runAs": null, + "notificationGroupFields": [], + "queries": [ + { + "rowId": "A", + "query": "Namespace=aws/lambda metric=ProvisionedConcurrencyUtilization statistic=Average account=* region=* functionname=* | avg by functionname, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", "threshold": 50, "thresholdType": "LessThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", "threshold": 50, "thresholdType": "GreaterThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - High 4XX Errors", - "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", + "name": "AWS Network Load Balancer - High Unhealthy Hosts", + "description": "This alert fires when we detect that there are too many unhealthy hosts (>=10%) within an interval of 5 minutes for a given network load balancer.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ {
"rowId": "A", - "query": "Namespace=aws/apigateway metric=4xxError Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" + "query": "Namespace=aws/NetworkELB metric=UnHealthyHostCount Statistic=sum account=* region=* LoadBalancer=* AvailabilityZone=* | sum by LoadBalancer, AvailabilityZone, account, region, namespace" }, { "rowId": "B", - "query": "Namespace=aws/apigateway metric=count Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" + "query": "Namespace=aws/NetworkELB metric=HealthyHostCount Statistic=sum account=* region=* LoadBalancer=* AvailabilityZone=* | sum by LoadBalancer, AvailabilityZone, account, region, namespace" }, { "rowId": "C", - "query": "#A * 100 / #B along apiname, account, region, namespace" + "query": "#A * 100 / (#A + #B) along LoadBalancer, AvailabilityZone, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 10, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 10, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS EC2 - High Total CPU Utilization", - "description": "This alert fires when the average total CPU utilization within a 5 minute 
interval for an EC2 instance is high (>=85%).", + "name": "AWS Classic Load Balancer - High 5XX Errors", + "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/ec2 metric=CPU_Total account=* region=* instanceid=* | avg by account, region, namespace, instanceid" + "query": "Namespace=aws/elb metric=HTTPCode_ELB_5XX Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/elb metric=RequestCount Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" + }, + { + "rowId": "C", + "query": "#A * 100 / #B along loadbalancername, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - 
"name": "AWS Classic Load Balancer - High Latency", - "description": "This alert fires when we detect that the average latency for a given Classic load balancer within a time interval of 5 minutes is greater than or equal to three seconds.", + "name": "Amazon Elasticache - High Engine CPU Utilization", + "description": "This alert fires when the average CPU utilization for the Redis engine process within a 5 minute interval is high (>=90%). For larger node types with four vCPUs or more, use the EngineCPUUtilization metric to monitor and set thresholds for scaling.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/elb metric=Latency Statistic=Average account=* region=* loadbalancername=* | eval(_value*1000) | sum by account, region, namespace, loadbalancername" + "query": "Namespace=aws/elasticache metric=EngineCPUUtilization statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by CacheClusterId, CacheNodeId, account, region, namespace" } ], "triggers": [ { "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 3000, + "threshold": 90, "thresholdType": "GreaterThanOrEqual", - "occurrenceType": "Always" + "occurrenceType": "Always", + "minDataPoints": 2 }, { "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 3000, + "threshold": 90, "thresholdType": "LessThan", - "occurrenceType": "Always" + "occurrenceType": "Always", + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] 
}, { - "name": "AWS EC2 - High Disk Utilization", - "description": "This alert fires when the average disk utilization within a 5 minute time interval for an EC2 instance is high (>=85%).", + "name": "AWS EC2 - High Memory Utilization", + "description": "This alert fires when the average memory utilization within a 5 minute interval for an EC2 instance is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "0m", @@ -983,131 +1422,146 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/ec2 metric=Disk_UsedPercent account=* region=* instanceid=* | avg by account, region, namespace, instanceid, devname" + "query": "Namespace=aws/ec2 metric=Mem_UsedPercent account=* region=* instanceid=* | avg by account, region, namespace, instanceid" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", "threshold": 85, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", "threshold": 85, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon ECS - High CPU Utilization", - "description": "This alert fires when the average CPU utilization within a 5 minute interval for a service within a cluster is high (>=85%).", + "name": "Amazon ECS - High Memory Utilization", + "description": "This alert fires when 
the average memory utilization within a 5 minute interval for a service within a cluster is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/ecs metric=CPUUtilization statistic=Average account=* region=* ClusterName=* ServiceName=* | avg by ClusterName, ServiceName, account, region, namespace" + "query": "Namespace=aws/ecs metric=MemoryUtilization statistic=Average account=* region=* ClusterName=* ServiceName=* | avg by ClusterName, ServiceName, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", "threshold": 85, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", "threshold": 85, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Network Load Balancer - High TLS Negotiation Errors", - "description": "This alert fires when we detect that there are too many TLS Negotiation Errors (>=10%) within an interval of 5 minutes for a given network load balancer", + "name": "AWS Lambda - High Percentage of Failed Requests", + "description": "This alert fires when we detect a large number of failed Lambda requests 
(>5%) within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/NetworkELB metric=ClientTLSNegotiationErrorCount Statistic=sum account=* region=* LoadBalancer=* | sum by LoadBalancer, account, region, namespace" + "query": "Namespace=aws/lambda metric=Errors Statistic=Sum account=* region=* functionname=* | sum by functionname, account, region, namespace" }, { "rowId": "B", - "query": "Namespace=aws/NetworkELB metric=TargetTLSNegotiationErrorCount Statistic=sum account=* region=* LoadBalancer=* | sum by LoadBalancer, account, region, namespace" + "query": "Namespace=aws/lambda metric=Invocations Statistic=Sum account=* region=* functionname=* | sum by functionname, account, region, namespace" }, { "rowId": "C", - "query": "(#A + #B) along LoadBalancer, account, region, namespace" + "query": "#A * 100 / #B along functionname, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, + "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, + "threshold": 5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + 
"tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - Multiple Failed Operations", - "description": "This alert fires when we detect multiple failed operations within a 15 minute interval for an ElastiCache service.", + "name": "AWS DynamoDB - Multiple Tables deleted", + "description": "This alert fires when five or more tables are deleted within 15 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Logs", "evaluationDelay": "0m", @@ -1117,41 +1571,44 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/elasticache \"\\\"eventSource\\\":\\\"elasticache.amazonaws.com\\\"\" errorCode errorMessage\n| json \"eventSource\", \"errorCode\", \"errorMessage\", \"userIdentity\", \"requestParameters\", \"responseElements\" as event_source, error_code, error_message, user_identity, requestParameters, responseElements nodrop\n| json field=requestParameters \"cacheClusterId\" as req_cacheClusterId nodrop\n| json field=responseElements \"cacheClusterId\" as res_cacheClusterId nodrop\n| json field=user_identity \"arn\", \"userName\" nodrop \n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountId, user nodrop\n| if (isEmpty(userName), user, userName) as user\n| if (isEmpty(req_cacheClusterId), res_cacheClusterId, req_cacheClusterId) as cacheclusterid\n| where event_source matches \"elasticache.amazonaws.com\" and !isEmpty(error_code) and !isEmpty(error_message) and !isEmpty(user)\n| count as event_count by _messageTime, account, region, event_source, error_code, error_message, user, cacheclusterid\n| formatDate(_messageTime, \"MM/dd/yyyy HH:mm:ss:SSS Z\") as message_date\n| fields message_date, account, region, event_source, error_code, error_message, user, cacheclusterid\n| fields -_messageTime" + "query": "account=* region=* namespace=aws/dynamodb eventSource \"dynamodb.amazonaws.com\"\n| json \"eventSource\", \"eventName\", \"requestParameters.tableName\", 
\"sourceIPAddress\", \"userIdentity.userName\", \"userIdentity.sessionContext.sessionIssuer.userName\" as event_source, event_name, tablename, SourceIp, UserName, ContextUserName nodrop\n| where event_source = \"dynamodb.amazonaws.com\" and event_name = \"DeleteTable\"\n| if (isEmpty(UserName), ContextUserName, UserName) as user\n| count by _messageTime, account, region, namespace, event_name, user, tablename\n| formatDate(_messageTime, \"MM/dd/yyyy HH:mm:ss:SSS Z\") as message_date\n| fields message_date, account, region, namespace, event_name, user, tablename\n| fields -_messageTime" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-15m", - "threshold": 10, + "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults" + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-15m", - "threshold": 10, + "threshold": 5, "thresholdType": "LessThan", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults" + "field": null } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, - "playbook": "" + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - High Redis Database Memory Usage", - "description": "This alert fires when the average database memory usage within a 5 minute interval for the Redis engine is high (>=95%). 
When the value reaches 100%, eviction may happen or write operations may fail based on ElastiCache policies thereby impacting application performance.", + "name": "Amazon Elasticache - Multiple Failed Operations", + "description": "This alert fires when we detect multiple failed operations within a 15 minute interval for an ElastiCache service.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", + "monitorType": "Logs", "evaluationDelay": "0m", "alertName": null, "runAs": null, @@ -1159,83 +1616,91 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=DatabaseMemoryUsagePercentage statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" + "query": "account=* region=* namespace=aws/elasticache \"\\\"eventSource\\\":\\\"elasticache.amazonaws.com\\\"\" errorCode errorMessage\n| json \"eventSource\", \"errorCode\", \"errorMessage\", \"userIdentity\", \"requestParameters\", \"responseElements\" as event_source, error_code, error_message, user_identity, requestParameters, responseElements nodrop\n| json field=requestParameters \"cacheClusterId\" as req_cacheClusterId nodrop\n| json field=responseElements \"cacheClusterId\" as res_cacheClusterId nodrop\n| json field=user_identity \"arn\", \"userName\" nodrop \n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountId, user nodrop\n| if (isEmpty(userName), user, userName) as user\n| if (isEmpty(req_cacheClusterId), res_cacheClusterId, req_cacheClusterId) as cacheclusterid\n| where event_source matches \"elasticache.amazonaws.com\" and !isEmpty(error_code) and !isEmpty(error_message) and !isEmpty(user)\n| count as event_count by _messageTime, account, region, event_source, error_code, error_message, user, cacheclusterid\n| formatDate(_messageTime, \"MM/dd/yyyy HH:mm:ss:SSS Z\") as message_date\n| fields message_date, account, region, event_source, error_code, 
error_message, user, cacheclusterid\n| fields -_messageTime" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", - "timeRange": "-5m", - "threshold": 95, + "resolutionWindow": null, + "timeRange": "-15m", + "threshold": 10, "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", - "timeRange": "-5m", - "threshold": 95, + "resolutionWindow": null, + "timeRange": "-15m", + "threshold": 10, "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "field": null } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS EC2 - High System CPU Utilization", - "description": "This alert fires when the average system CPU utilization within a 5 minute interval for an EC2 instance is high (>=85%).", + "name": "Amazon RDS - High CPU Utilization", + "description": "This alert fires when we detect that the average CPU utilization for a database is high (>=85%) for an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/ec2 metric=CPU_Sys account=* region=* instanceid=* | avg by account, region, namespace, instanceid" + "query": "Namespace=aws/rds metric=CPUUtilization statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ { - "detectionMethod": 
"StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", "threshold": 85, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", "threshold": 85, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Lambda - Low Provisioned Concurrency Utilization", - "description": "This alert fires when the average provisioned concurrency utilization for 5 minutes is low (<= 50%). 
This indicates low provisioned concurrency utilization efficiency.", + "name": "AWS Classic Load Balancer - Access from Highly Malicious Sources", + "description": "This alert fires when the Classic load balancer is accessed from highly malicious IP addresses within last 5 minutes.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", + "monitorType": "Logs", "evaluationDelay": "0m", "alertName": null, "runAs": null, @@ -1243,41 +1708,44 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/lambda metric=ProvisionedConcurrencyUtilization statistic=Average account=* region=* functionname=* | avg by functionname, namespace, region, account" + "query": "account=* region=* namespace=aws/elb\n| parse \"* * * * * * * * * * * \\\"*\\\" \\\"*\\\" * *\" as datetime, loadbalancername, client, backend, request_processing_time, backend_processing_time, response_processing_time, elb_status_code, backend_status_code, received_bytes, sent_bytes, request, user_agent, ssl_cipher, ssl_protocol\n| parse regex \"(?<ClientIp>\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancername, account, region, namespace\n| threatlookup singleIndicator ClientIp\n| where (_threatlookup.type=\"ipv4-addr:value\" or _threatlookup.type=\"ipv6-addr:value\") and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as MaliciousConfidence\n| where MaliciousConfidence=\"high\"\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancername, account, region, namespace, MaliciousConfidence, Actor" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType":
"Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "LessThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "threshold": 0, + "thresholdType": "GreaterThan", + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "GreaterThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "threshold": 0, + "thresholdType": "LessThanOrEqual", + "field": null } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - High Max Provisioned Table Write Capacity", - "description": "This alert fires when we detect that the average percentage of write provisioned capacity used by the highest write provisioned table of an account for a time interval of 5 minutes is great than or equal to 80%. 
High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", + "name": "AWS SNS - Access from Highly Malicious Sources", + "description": "This alert fires when AWS SNS is accessed from highly malicious IP addresses within the last 5 minutes.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", + "monitorType": "Logs", "evaluationDelay": "0m", "alertName": null, "runAs": null, @@ -1285,83 +1753,99 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/dynamodb metric=MaxProvisionedTableWriteCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" + "query": "account=* region=* namespace=aws/sns \"\\\"eventsource\\\":\\\"sns.amazonaws.com\\\"\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"sourceIPAddress\", \"userAgent\", \"eventType\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"requestID\", \"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, src_ip, user_agent, event_type, recipient_account_id, requestParameters, responseElements, request_id, error_code, error_message nodrop\n| where event_source = \"sns.amazonaws.com\" \n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, user_type, arn, username nodrop\n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountid, user nodrop\n| json field=requestParameters \"topicArn\", \"name\", \"resourceArn\", \"subscriptionArn\" as req_topic_arn, req_topic_name, resource_arn, subscription_arn nodrop \n| json field=responseElements \"topicArn\" as res_topic_arn nodrop\n| if (isBlank(req_topic_arn), res_topic_arn, req_topic_arn) as topic_arn\n| if (isBlank(topic_arn), resource_arn, topic_arn) as topic_arn\n| parse field=topic_arn \"arn:aws:sns:*:*:*\" as region_temp, accountid_temp,
topic_arn_name_temp nodrop\n| parse field=subscription_arn \"arn:aws:sns:*:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp, arn_value_temp nodrop\n| if (isBlank(req_topic_name), topic_arn_name_temp, req_topic_name) as topicname\n| if (isBlank(accountid), recipient_account_id, accountid) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status\n| if (isEmpty(username), user_type, username) as user_type\n| count as ip_count by src_ip, event_name, region, accountid,user_type\n| threatlookup singleIndicator src_ip\n| where (_threatlookup.type=\"ipv4-addr:value\" or _threatlookup.type=\"ipv6-addr:value\") and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as malicious_confidence\n| where malicious_confidence=\"high\"\n| sum(ip_count) as threat_count by src_ip, event_name, region, accountid, malicious_confidence, actor" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, - "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "threshold": 0, + "thresholdType": "GreaterThan", + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, - "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "threshold": 0, + "thresholdType": "LessThanOrEqual", + "field": null } ], + "timeZone": null, "notifications": [], "isDisabled": true, - 
"groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - High Max Provisioned Table Read Capacity", - "description": "This alert fires when we detect that the average percentage of read provisioned capacity used by the highest read provisioned table of an account for a time interval of 5 minutes is great than or equal to 80%. High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", + "name": "AWS Application Load Balancer - High 4XX Errors", + "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/dynamodb metric=MaxProvisionedTableReadCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" + "query": "Namespace=aws/applicationelb metric=HTTPCode_ELB_4XX_Count Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/applicationelb metric=RequestCount Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" + }, + { + "rowId": "C", + "query": "#A * 100 / #B along loadbalancer, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": 
"AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - Multiple Tables deleted", - "description": "This alert fires when we detect multiple failed operations for Elasticache service within 15 minutes", + "name": "AWS EC2 - High System CPU Utilization", + "description": "This alert fires when the average system CPU utilization within a 5 minute interval for an EC2 instance is high (>=85%).", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", + "monitorType": "Metrics", "evaluationDelay": "0m", "alertName": null, "runAs": null, @@ -1369,255 +1853,289 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/dynamodb eventSource \"dynamodb.amazonaws.com\"\n| json \"eventSource\", \"eventName\", \"requestParameters.tableName\", \"sourceIPAddress\", \"userIdentity.userName\", \"userIdentity.sessionContext.sessionIssuer.userName\" as event_source, event_name, tablename, SourceIp, UserName, ContextUserName nodrop\n| where event_source = \"dynamodb.amazonaws.com\" and event_name = \"DeleteTable\"\n| if (isEmpty(UserName), ContextUserName, UserName) as user\n| count by _messageTime, account, region, namespace, event_name, user, tablename\n| formatDate(_messageTime, \"MM/dd/yyyy HH:mm:ss:SSS Z\") as message_date\n| fields message_date, account, region, namespace, event_name, user, tablename\n| fields -_messageTime" + "query": "Namespace=aws/ec2 
metric=CPU_Sys account=* region=* instanceid=* | avg by account, region, namespace, instanceid" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", - "timeRange": "-15m", - "threshold": 5, + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 85, "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults" + "occurrenceType": "Always", + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", - "timeRange": "-15m", - "threshold": 5, + "resolutionWindow": null, + "timeRange": "-5m", + "threshold": 85, "thresholdType": "LessThan", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults" + "occurrenceType": "Always", + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, - "playbook": "" + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - High Account Provisioned Write Capacity", - "description": "This alert fires when we detect that the average write capacity provisioned for an account for a time interval of 5 minutes is greater than or equal to 80%. 
High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", + "name": "AWS API Gateway - High Latency", + "description": "This alert fires when we detect that the average latency for a given API Gateway is greater than or equal to one second for 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/dynamodb metric=AccountProvisionedWriteCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" + "query": "Namespace=aws/apigateway metric=Latency statistic=Average account=* region=* apiname=* | avg by apiname, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 1000, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 1000, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - High Read Latency", - "description": "This alert fires when the average read latency of a database within a 5 minutes time inerval is high 
(>=5 seconds). High read latency will affect the performance of your application.", + "name": "Amazon Elasticache - High CPU Utilization", + "description": "This alert fires when the average CPU utilization within a 5 minute interval for a host is high (>=90%). The CPUUtilization metric includes total CPU utilization across application, operating system and management processes. We highly recommend monitoring CPU utilization for hosts with two vCPUs or less.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=ReadLatency statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "Namespace=aws/elasticache metric=CPUUtilization statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by CacheClusterId, CacheNodeId, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 90, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 90, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS 
Application Load Balancer - Access from Highly Malicious Sources", - "description": "This alert fires when an Application load balancer is accessed from highly malicious IP addresses within last 5 minutes", + "name": "AWS API Gateway - High 5XX Errors", + "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/applicationelb\n| parse \"* * * * * * * * * * * * \\\"*\\\" \\\"*\\\" * * * \\\"*\\\"\" as Type, DateTime, loadbalancer, Client, Target, RequestProcessingTime, TargetProcessingTime, ResponseProcessingTime, ElbStatusCode, TargetStatusCode, ReceivedBytes, SentBytes, Request, UserAgent, SslCipher, SslProtocol, TargetGroupArn, TraceId\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancer, account, region, namespace\n| lookup type, actor, raw, threatlevel as MaliciousConfidence from sumo://threat/cs on threat=ClientIp \n| json field=raw \"labels[*].name\" as LabelName \n| replace(LabelName, \"\\\\/\",\"->\") as LabelName\n| replace(LabelName, \"\\\"\",\" \") as LabelName\n| where type=\"ip_address\" and MaliciousConfidence=\"high\"\n| if (isEmpty(actor), \"Unassigned\", actor) as Actor\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancer, account, region, namespace, MaliciousConfidence, Actor, LabelName" + "query": "Namespace=aws/apigateway metric=5xxError Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/apigateway metric=count Statistic=Sum account=* region=* apiname=* | sum 
by apiname, account, region, namespace" + }, + { + "rowId": "C", + "query": "#A * 100 / #B along account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "GreaterThan", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults" + "threshold": 5, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "LessThanOrEqual", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults" + "threshold": 5, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, - "playbook": "" + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Classic Load Balancer - High 4XX Errors", - "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", + "name": "Amazon Elasticache - Low Redis Cache Hit Rate", + "description": "This alert fires when the average cache hit rate for Redis within a 5 minute interval is low (<= 80%). This indicates low efficiency of the Redis instance. 
If cache ratio is lower than 80%, that indicates a significant amount of keys are either evicted, expired, or don't exist.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/elb metric=HTTPCode_ELB_4XX Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/elb metric=RequestCount Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" - }, - { - "rowId": "C", - "query": "#A * 100 / #B along loadbalancername, account, region, namespace" + "query": "Namespace=aws/elasticache metric=CacheHitRate statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" } ], "triggers": [ { "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "GreaterThanOrEqual", - "occurrenceType": "Always" + "threshold": 80, + "thresholdType": "LessThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 }, { "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "LessThan", - "occurrenceType": "Always" + "threshold": 80, + "thresholdType": "GreaterThan", + "occurrenceType": "Always", + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - High CPU Utilization", - "description": "This alert fires when we detect that the 
average CPU utilization for a database is high (>=85%) for an interval of 5 minutes.", + "name": "AWS SQS - Message processing not fast enough", + "description": "This alert fires when we detect message processing is not fast enough. That is, the average approximate age of the oldest non-deleted message in the queue is more than 5 seconds for an interval of 5 minutes", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=CPUUtilization statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "metric=ApproximateAgeOfOldestMessage Statistic=avg region=* account=* queuename=* namespace=aws/sqs | avg by account,region,namespace,queuename " } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, - "thresholdType": "GreaterThanOrEqual", - "field": null, + "threshold": 5, + "thresholdType": "GreaterThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, - "thresholdType": "LessThan", - "field": null, + "threshold": 5, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - High Disk Queue Depth", - "description": 
"This alert fires when the average disk queue depth for a database is high (>=5) for an interval of 5 minutes. Higher this value, higher will be the number of outstanding I/Os (read/write requests) waiting to access the disk, which will impact the performance of your application.", + "name": "AWS Application Load Balancer - Access from Highly Malicious Sources", + "description": "This alert fires when an Application load balancer is accessed from highly malicious IP addresses within last 5 minutes", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", + "monitorType": "Logs", "evaluationDelay": "0m", "alertName": null, "runAs": null, @@ -1625,39 +2143,42 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=DiskQueueDepth statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "account=* region=* namespace=aws/applicationelb\n| parse \"* * * * * * * * * * * * \\\"*\\\" \\\"*\\\" * * * \\\"*\\\"\" as Type, DateTime, loadbalancer, Client, Target, RequestProcessingTime, TargetProcessingTime, ResponseProcessingTime, ElbStatusCode, TargetStatusCode, ReceivedBytes, SentBytes, Request, UserAgent, SslCipher, SslProtocol, TargetGroupArn, TraceId\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancer, account, region, namespace\n| threatlookup singleIndicator ClientIp\n| where (_threatlookup.type=\"ipv4-addr:value\" or _threatlookup.type=\"ipv6-addr:value\") and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as MaliciousConfidence\n| where MaliciousConfidence=\"high\"\n| sum (ip_count) 
as ThreatCount by ClientIp, loadbalancer, account, region, namespace, MaliciousConfidence, Actor" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "threshold": 0, + "thresholdType": "GreaterThan", + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "threshold": 0, + "thresholdType": "LessThanOrEqual", + "field": null } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - High Account Provisioned Read Capacity", - "description": "This alert fires when we detect that the average read capacity provisioned for an account for a time interval of 5 minutes is greater than or equal to 80%. 
High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", + "name": "AWS EC2 - High Disk Utilization", + "description": "This alert fires when the average disk utilization within a 5 minute time interval for an EC2 instance is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "0m", @@ -1667,35 +2188,40 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/dynamodb metric=AccountProvisionedReadCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" + "query": "Namespace=aws/ec2 metric=Disk_UsedPercent account=* region=* instanceid=* | avg by account, region, namespace, instanceid, devname" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 85, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", + "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 85, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries" + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, - "playbook": "" + "groupNotifications": true, + "playbook": "", + "sloId": null, + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] } ] -} +} \ No newline at end of file diff --git a/aws-observability/json/Alerts-App.json b/aws-observability/json/Alerts-App.json index e60038f0..fb96a1e4 100644 --- a/aws-observability/json/Alerts-App.json +++ b/aws-observability/json/Alerts-App.json @@ 
-1,67 +1,56 @@ { "name": "AWS Observability", - "description": "", + "description": "This folder contains all the monitors for AWS Observability solution.", "type": "MonitorsLibraryFolderExport", "children": [ { - "name": "AWS API Gateway - High 5XX Errors", - "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", + "name": "AWS SQS - Access from Highly Malicious Sources", + "description": "This alert fires when an Application AWS - SQS is accessed from highly malicious IP addresses within last 5 minutes", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/apigateway metric=5xxError Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/apigateway metric=count Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" - }, - { - "rowId": "C", - "query": "#A * 100 / #B along account, region, namespace" + "query": "account=* region=* namespace=\"aws/sqs\" eventname eventsource \"sqs.amazonaws.com\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"sourceIPAddress\",\"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, recipient_account_id, requestParameters, responseElements, src_ip, error_code, error_message nodrop\n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, type, arn, username nodrop\n| json field=requestParameters \"queueUrl\" as queueUrlReq nodrop \n| json field=responseElements \"queueUrl\" as queueUrlRes nodrop\n| where event_source=\"sqs.amazonaws.com\" and !(src_ip 
matches \"*.amazonaws.com\")\n| if(event_name=\"CreateQueue\", queueUrlRes, queueUrlReq) as queueUrl \n| parse regex field=queueUrl \"(?[^\\/]*$)\"\n| if (isBlank(recipient_account_id), accountid, recipient_account_id) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status \n| count as ip_count by src_ip\n| threatlookup singleIndicator src_ip\n| where (_threatlookup.type=\"ipv4-addr:value\" or _threatlookup.type=\"ipv6-addr:value\") and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as malicious_confidence\n| where malicious_confidence=\"high\"\n| sort by ip_count, src_ip\n| fields src_ip, malicious_confidence, actor" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "threshold": 0, + "thresholdType": "GreaterThan", + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "threshold": 0, + "thresholdType": "LessThanOrEqual", + "field": null } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] 
}, { - "name": "AWS EC2 - High System CPU Utilization", - "description": "This alert fires when the average system CPU utilization within a 5 minute interval for an EC2 instance is high (>=85%).", + "name": "AWS EC2 - High Total CPU Utilization", + "description": "This alert fires when the average total CPU utilization within a 5 minute interval for an EC2 instance is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "0m", @@ -71,87 +60,91 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/ec2 metric=CPU_Sys account=* region=* instanceid=* | avg by account, region, namespace, instanceid" + "query": "Namespace=aws/ec2 metric=CPU_Total account=* region=* instanceid=* | avg by account, region, namespace, instanceid" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 85, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 85, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS SNS - Failed Events", - "description": "This alert fires when an SNS app has high number of failed events (>5) within last 5 minutes", + "name": "Amazon RDS - High Write Latency", + "description": "This alert fires when the average write latency of a database within a 5 minute 
interval is high (>=5 seconds) . High write latencies will affect the performance of your application.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/sns \"\\\"eventsource\\\":\\\"sns.amazonaws.com\\\"\" errorCode\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"sourceIPAddress\", \"userAgent\", \"eventType\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"requestID\", \"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, src_ip, user_agent, event_type, recipient_account_id, requestParameters, responseElements, request_id, error_code, error_message nodrop\n| where event_source = \"sns.amazonaws.com\" and !isblank(error_code)\n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, type, arn, username nodrop\n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountId, user nodrop\n| json field=requestParameters \"topicArn\", \"name\", \"resourceArn\", \"subscriptionArn\" as req_topic_arn, req_topic_name, resource_arn, subscription_arn nodrop \n| json field=responseElements \"topicArn\" as res_topic_arn nodrop\n| if (isBlank(req_topic_arn), res_topic_arn, req_topic_arn) as topic_arn\n| if (isBlank(topic_arn), resource_arn, topic_arn) as topic_arn\n| parse field=topic_arn \"arn:aws:sns:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp nodrop\n| parse field=subscription_arn \"arn:aws:sns:*:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp, arn_value_temp nodrop\n| if (isBlank(req_topic_name), topic_arn_name_temp, req_topic_name) as topicname\n| if (isBlank(accountid), recipient_account_id, accountid) as accountid\n| if (isEmpty(error_code), \"Success\", 
\"Failure\") as event_status\n| if (isEmpty(username), user, username) as user\n| count as event_count by event_name, error_code, error_message, region, src_ip, accountid, user, type, request_id, topicname, topic_arn, user_agent\n" + "query": "Namespace=aws/rds metric=WriteLatency statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, - "thresholdType": "GreaterThan", - "field": null + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 }, { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, - "thresholdType": "LessThanOrEqual", - "field": null + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - High Account Provisioned Write Capacity", - "description": "This alert fires when we detect that the average write capacity provisioned for an account for a time interval of 5 minutes is greater than or equal to 80%. High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", + "name": "Amazon RDS - Low Aurora Buffer Cache Hit Ratio", + "description": "This alert fires when the average RDS Aurora buffer cache hit ratio within a 5 minute interval is low (<= 50%). 
This indicates that a lower percentage of requests are served by the buffer cache, which could further indicate a degradation in application performance.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -161,87 +154,91 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/dynamodb metric=AccountProvisionedWriteCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" + "query": "Namespace=aws/rds metric=BufferCacheHitRatio statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, - "thresholdType": "GreaterThanOrEqual", - "field": null, + "threshold": 50, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, - "thresholdType": "LessThan", - "field": null, + "threshold": 50, + "thresholdType": "GreaterThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS SNS - Access from Highly Malicious Sources", - "description": "This alert fires when an Application AWS - SNS is accessed from highly malicious IP addresses within last 5 minutes", + "name": "Amazon Elasticache - High Redis Memory Fragmentation Ratio", + "description": "This alert fires when the average Redis memory
fragmentation ratio for within a 5 minute interval is high (>=1.5). Value equal to or greater than 1.5 Indicate significant memory fragmentation.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/sns \"\\\"eventsource\\\":\\\"sns.amazonaws.com\\\"\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"sourceIPAddress\", \"userAgent\", \"eventType\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"requestID\", \"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, src_ip, user_agent, event_type, recipient_account_id, requestParameters, responseElements, request_id, error_code, error_message nodrop\n| where event_source = \"sns.amazonaws.com\" \n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, user_type, arn, username nodrop\n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountid, user nodrop\n| json field=requestParameters \"topicArn\", \"name\", \"resourceArn\", \"subscriptionArn\" as req_topic_arn, req_topic_name, resource_arn, subscription_arn nodrop \n| json field=responseElements \"topicArn\" as res_topic_arn nodrop\n| if (isBlank(req_topic_arn), res_topic_arn, req_topic_arn) as topic_arn\n| if (isBlank(topic_arn), resource_arn, topic_arn) as topic_arn\n| parse field=topic_arn \"arn:aws:sns:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp nodrop\n| parse field=subscription_arn \"arn:aws:sns:*:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp, arn_value_temp nodrop\n| if (isBlank(req_topic_name), topic_arn_name_temp, req_topic_name) as topicname\n| if (isBlank(accountid), recipient_account_id, accountid) as accountid\n| if 
(isEmpty(error_code), \"Success\", \"Failure\") as event_status\n| if (isEmpty(username), user_type, username) as user_type\n| count as ip_count by src_ip, event_name, region, accountid,user_type\n| lookup type, actor, raw, threatlevel as malicious_confidence from sumo://threat/cs on threat=src_ip\n| where type=\"ip_address\" and malicious_confidence = \"high\"\n| json field=raw \"labels[*].name\" as label_name \n| replace(label_name, \"\\\\/\",\"->\") as label_name\n| replace(label_name, \"\\\"\",\" \") as label_name\n| if (isEmpty(actor), \"Unassigned\", actor) as actor\n| sum(ip_count) as threat_count by src_ip, event_name, region, accountid, malicious_confidence, actor, label_name\n" + "query": "Namespace=aws/elasticache metric=MemoryFragmentationRatio statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" } ], "triggers": [ { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "GreaterThan", - "field": null + "threshold": 1.5, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 }, { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "LessThanOrEqual", - "field": null + "threshold": 1.5, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - High Integration Latency", - "description": "This alert fires when we detect that the average integration latency for a 
given API Gateway is greater than or equal to one second for 5 minutes.", + "name": "Amazon ECS - High CPU Utilization", + "description": "This alert fires when the average CPU utilization within a 5 minute interval for a service within a cluster is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -251,45 +248,44 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/apigateway metric=IntegrationLatency statistic=Average account=* region=* apiname=* | avg by apiname, namespace, region, account" + "query": "Namespace=aws/ecs metric=CPUUtilization statistic=Average account=* region=* ClusterName=* ServiceName=* | avg by ClusterName, ServiceName, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1000, + "threshold": 85, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1000, + "threshold": 85, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Lambda - High Percentage of Failed Requests", - "description": "This alert fires when we detect a large number of failed Lambda requests (>5%) within an interval of 5 minutes.", + "name": "AWS SNS - Failed Notifications", + "description": "This alert fires where there are 
many failed notifications (>2) within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -299,53 +295,44 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/lambda metric=Errors Statistic=Sum account=* region=* functionname=* | sum by functionname, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/lambda metric=Invocations Statistic=Sum account=* region=* functionname=* | sum by functionname, account, region, namespace" - }, - { - "rowId": "C", - "query": "#A * 100 / #B along functionname, account, region, namespace" + "query": "account=* region=* namespace=aws/sns TopicName=* metric=NumberOfNotificationsFailed Statistic=Sum \n| sum by account, region, TopicName" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "GreaterThanOrEqual", - "field": null, + "threshold": 2, + "thresholdType": "GreaterThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "LessThan", - "field": null, + "threshold": 2, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - Low Aurora Buffer Cache Hit Ratio", - "description": "This alert fires when the average RDS Aurora buffer cache hit ratio within a 5 minute interval is low (<= 
50%). This indicates that a lower percentage of requests were are served by the buffer cache, which could further indicate a degradation in application performance.", + "name": "Amazon RDS - High Read Latency", + "description": "This alert fires when the average read latency of a database within a 5 minute time interval is high (>=5 seconds). High read latency will affect the performance of your application.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -355,45 +342,44 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=BufferCacheHitRatio statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "Namespace=aws/rds metric=ReadLatency statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "LessThanOrEqual", - "field": null, + "threshold": 5, + "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "GreaterThan", - "field": null, + "threshold": 5, + "thresholdType": "LessThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Lambda - Low Provisioned Concurrency Utilization", - "description": "This alert 
fires when the average provisioned concurrency utilization for 5 minutes is low (<= 50%). This indicates low provisioned concurrency utilization efficiency.", + "name": "AWS Classic Load Balancer - High 4XX Errors", + "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -403,45 +389,52 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/lambda metric=ProvisionedConcurrencyUtilization statistic=Average account=* region=* functionname=* | avg by functionname, namespace, region, account" + "query": "Namespace=aws/elb metric=HTTPCode_ELB_4XX Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/elb metric=RequestCount Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" + }, + { + "rowId": "C", + "query": "#A * 100 / #B along loadbalancername, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "LessThanOrEqual", - "field": null, + "threshold": 5, + "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "GreaterThan", - "field": null, + "threshold": 5, + "thresholdType": "LessThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + 
"groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - Low Redis Cache Hit Rate", - "description": "This alert fires when the average cache hit rate for Redis within a 5 minute interval is low (<= 80%). This indicates low efficiency of the Redis instance. If cache ratio is lower than 80%, that indicates a significant amount of keys are either evicted, expired, or don't exist.", + "name": "Amazon Elasticache - High Redis Database Memory Usage", + "description": "This alert fires when the average database memory usage within a 5 minute interval for the Redis engine is high (>=95%). When the value reaches 100%, eviction may happen or write operations may fail based on ElastiCache policies thereby impacting application performance.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -451,45 +444,44 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=CacheHitRate statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" + "query": "Namespace=aws/elasticache metric=DatabaseMemoryUsagePercentage statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, - "thresholdType": "LessThanOrEqual", - "field": null, + "threshold": 95, + "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - 
"threshold": 80, - "thresholdType": "GreaterThan", - "field": null, + "threshold": 95, + "thresholdType": "LessThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS SQS - Message processing not fast enough", - "description": "This alert fires when we detect message processing is not fast enough. That is, the average approximate age of the oldest non-deleted message in the queue is more than 5 seconds for an interval of 5 minutes.", + "name": "AWS SQS - Messages not processed", + "description": "This alert fires when we detect messages that have been received by a consumer, but have not been processed (deleted/failed). That is, the average number of messages that are in flight are >=20 for an interval of 5 minutes", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -499,7 +491,7 @@ "queries": [ { "rowId": "A", - "query": "metric=ApproximateAgeOfOldestMessage Statistic=avg region=* account=* queuename=* namespace=aws/sqs | avg by account,region,namespace,queuename " + "query": "metric=ApproximateNumberOfMessagesNotVisible Statistic=avg region = * account=* queuename=* namespace=aws/sqs | avg by account, region, namespace, queuename " } ], "triggers": [ @@ -508,80 +500,82 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "GreaterThan", + "threshold": 20, + "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", - "minDataPoints": 3 + "minDataPoints": 2 }, { "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "LessThanOrEqual", + 
"threshold": 20, + "thresholdType": "LessThan", "occurrenceType": "Always", - "minDataPoints": 3 + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS EC2 - High Memory Utilization", - "description": "This alert fires when the average memory utilization within a 5 minute interval for an EC2 instance is high (>=85%).", + "name": "AWS DynamoDB - High Max Provisioned Table Read Capacity", + "description": "This alert fires when we detect that the average percentage of read provisioned capacity used by the highest read provisioned table of an account for a time interval of 5 minutes is greater than or equal to 80%. High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/ec2 metric=Mem_UsedPercent account=* region=* instanceid=* | avg by account, region, namespace, instanceid" + "query": "Namespace=aws/dynamodb metric=MaxProvisionedTableReadCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 80, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": 
null, "timeRange": "-5m", - "threshold": 85, + "threshold": 80, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS SNS - Failed Notifications", - "description": "This alert fires where there are many failed notifications (>2) within an interval of 5 minutes.", + "name": "AWS DynamoDB - High Max Provisioned Table Write Capacity", + "description": "This alert fires when we detect that the average percentage of write provisioned capacity used by the highest write provisioned table of an account for a time interval of 5 minutes is greater than or equal to 80%. High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -591,7 +585,7 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/sns TopicName=* metric=NumberOfNotificationsFailed Statistic=Sum | sum by account, region, TopicName" + "query": "Namespace=aws/dynamodb metric=MaxProvisionedTableWriteCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" } ], "triggers": [ @@ -600,8 +594,8 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 2, - "thresholdType": "GreaterThan", + "threshold": 80, + "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", "minDataPoints": 2 }, @@ -610,22 +604,25 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 2, - "thresholdType": "LessThanOrEqual", + "threshold": 80, + "thresholdType": 
"LessThan", "occurrenceType": "Always", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - High Read Latency", - "description": "This alert fires when the average read latency of a database within a 5 minutes time inerval is high (>=5 seconds). High read latency will affect the performance of your application.", + "name": "Amazon RDS - Low Burst Balance", + "description": "This alert fires when we observe a low burst balance (<= 50%) for a given database. A low burst balance indicates you won't be able to scale up as fast for burstable database workloads on gp2 volumes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -635,45 +632,44 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=ReadLatency statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "Namespace=aws/rds metric=BurstBalance statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "GreaterThanOrEqual", - "field": null, + "threshold": 50, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, - "thresholdType": "LessThan", - "field": null, + "threshold": 50, + "thresholdType": "GreaterThan", "occurrenceType": "Always", - "triggerSource": 
"AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS SQS - Messages not processed", - "description": "This alert fires when we detect messages that have been received by a consumer, but have not been processed (deleted/failed). That is, the average number of messages that are in flight are >=20 for an interval of 5 minutes.", + "name": "AWS SQS - Queue has stopped receiving messages", + "description": "This alert fires when we detect that the queue has stopped receiving messages. That is, the average number of messages received in the queue <1 for an interval of 30 minutes", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -683,7 +679,7 @@ "queries": [ { "rowId": "A", - "query": "metric=ApproximateNumberOfMessagesNotVisible Statistic=avg region = * account=* queuename=* namespace=aws/sqs | avg by account, region, namespace, queuename " + "query": "metric=NumberOfMessagesReceived Statistic=avg region=* account=* queuename=* namespace=aws/sqs | avg by account, region, namespace, queuename " } ], "triggers": [ @@ -691,33 +687,36 @@ "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 20, - "thresholdType": "GreaterThanOrEqual", + "timeRange": "-30m", + "threshold": 1, + "thresholdType": "LessThan", "occurrenceType": "Always", - "minDataPoints": 3 + "minDataPoints": 2 }, { "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 20, - "thresholdType": "LessThan", + "timeRange": "-30m", + "threshold": 1, + "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", - "minDataPoints": 3 + 
"minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - High 4XX Errors", - "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", + "name": "Amazon RDS - High Disk Queue Depth", + "description": "This alert fires when the average disk queue depth for a database is high (>=5) for an interval of 5 minutes. Higher this value, higher will be the number of outstanding I/Os (read/write requests) waiting to access the disk, which will impact the performance of your application.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -727,53 +726,44 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/apigateway metric=4xxError Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/apigateway metric=count Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" - }, - { - "rowId": "C", - "query": "#A * 100 / #B along apiname, account, region, namespace" + "query": "Namespace=aws/rds metric=DiskQueueDepth statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", 
"threshold": 5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Application Load Balancer - High 4XX Errors", - "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", + "name": "AWS Application Load Balancer - High 5XX Errors", + "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -783,7 +773,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/applicationelb metric=HTTPCode_ELB_4XX_Count Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" + "query": "Namespace=aws/applicationelb metric=HTTPCode_ELB_5XX_Count Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" }, { "rowId": "B", @@ -796,96 +786,84 @@ ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], 
+ "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Network Load Balancer - High TLS Negotiation Errors", - "description": "This alert fires when we detect that there are too many TLS Negotiation Errors (>=10%) within an interval of 5 minutes for a given network load balancer", + "name": "AWS SNS - Failed Events", + "description": "This alert fires when an SNS app has high number of failed events (>5) within last 5 minutes", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/NetworkELB metric=ClientTLSNegotiationErrorCount Statistic=sum account=* region=* LoadBalancer=* | sum by LoadBalancer, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/NetworkELB metric=TargetTLSNegotiationErrorCount Statistic=sum account=* region=* LoadBalancer=* | sum by LoadBalancer, account, region, namespace" - }, - { - "rowId": "C", - "query": "(#A + #B) along LoadBalancer, account, region, namespace" + "query": "account=* region=* namespace=aws/sns \"\\\"eventsource\\\":\\\"sns.amazonaws.com\\\"\" errorCode \n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"sourceIPAddress\", \"userAgent\", \"eventType\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"requestID\", \"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, src_ip, user_agent, event_type, recipient_account_id, requestParameters, responseElements, request_id, error_code, error_message nodrop \n| where event_source = \"sns.amazonaws.com\" and !isblank(error_code) \n| json field=userIdentity 
\"accountId\", \"type\", \"arn\", \"userName\" as accountid, type, arn, username nodrop \n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountId, user nodrop \n| json field=requestParameters \"topicArn\", \"name\", \"resourceArn\", \"subscriptionArn\" as req_topic_arn, req_topic_name, resource_arn, subscription_arn nodrop \n| json field=responseElements \"topicArn\" as res_topic_arn nodrop \n| if (isBlank(req_topic_arn), res_topic_arn, req_topic_arn) as topic_arn \n| if (isBlank(topic_arn), resource_arn, topic_arn) as topic_arn \n| parse field=topic_arn \"arn:aws:sns:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp nodrop \n| parse field=subscription_arn \"arn:aws:sns:*:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp, arn_value_temp nodrop \n| if (isBlank(req_topic_name), topic_arn_name_temp, req_topic_name) as topicname \n| if (isBlank(accountid), recipient_account_id, accountid) as accountid \n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status \n| if (isEmpty(username), user, username) as user \n| count as event_count by event_name, error_code, error_message, region, src_ip, accountid, user, type, request_id, topicname, topic_arn, user_agent" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, - "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "threshold": 5, + "thresholdType": "GreaterThan", + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, - "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + 
"threshold": 5, + "thresholdType": "LessThanOrEqual", + "field": null } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS SQS - Queue has stopped receiving messages", - "description": "This alert fires when we detect that the queue has stopped receiving messages. That is, the average number of messages received in the queue <1 for an interval of 30 minutes.", + "name": "AWS API Gateway - High 4XX Errors", + "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -895,7 +873,15 @@ "queries": [ { "rowId": "A", - "query": "metric=NumberOfMessagesReceived Statistic=avg region=* account=* queuename=* namespace=aws/sqs | avg by account, region, namespace, queuename " + "query": "Namespace=aws/apigateway metric=4xxError Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/apigateway metric=count Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" + }, + { + "rowId": "C", + "query": "#A * 100 / #B along apiname, account, region, namespace" } ], "triggers": [ @@ -903,129 +889,138 @@ "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-30m", - "threshold": 1, - "thresholdType": "LessThan", + "timeRange": "-5m", + "threshold": 5, + "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", - "minDataPoints": 3 + "minDataPoints": 2 }, { "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-30m", - "threshold": 1, - 
"thresholdType": "GreaterThanOrEqual", + "timeRange": "-5m", + "threshold": 5, + "thresholdType": "LessThan", "occurrenceType": "Always", - "minDataPoints": 3 + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS EC2 - High Disk Utilization", - "description": "This alert fires when the average disk utilization within a 5 minute time interval for an EC2 instance is high (>=85%).", + "name": "AWS Network Load Balancer - High TLS Negotiation Errors", + "description": "This alert fires when we detect that there are too many TLS Negotiation Errors (>=10%) within an interval of 5 minutes for a given network load balancer", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "0m", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/ec2 metric=Disk_UsedPercent account=* region=* instanceid=* | avg by account, region, namespace, instanceid, devname" + "query": "Namespace=aws/NetworkELB metric=ClientTLSNegotiationErrorCount Statistic=sum account=* region=* LoadBalancer=* | sum by LoadBalancer, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/NetworkELB metric=TargetTLSNegotiationErrorCount Statistic=sum account=* region=* LoadBalancer=* | sum by LoadBalancer, account, region, namespace" + }, + { + "rowId": "C", + "query": "(#A + #B) along LoadBalancer, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 10, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": 
"AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 10, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - Multiple Tables deleted", - "description": "This alert fires when five or more tables are deleted within 15 minutes.", + "name": "AWS DynamoDB - High Account Provisioned Write Capacity", + "description": "This alert fires when we detect that the average write capacity provisioned for an account for a time interval of 5 minutes is greater than or equal to 80%. 
High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/dynamodb eventSource \"dynamodb.amazonaws.com\"\n| json \"eventSource\", \"eventName\", \"requestParameters.tableName\", \"sourceIPAddress\", \"userIdentity.userName\", \"userIdentity.sessionContext.sessionIssuer.userName\" as event_source, event_name, tablename, SourceIp, UserName, ContextUserName nodrop\n| where event_source = \"dynamodb.amazonaws.com\" and event_name = \"DeleteTable\"\n| if (isEmpty(UserName), ContextUserName, UserName) as user\n| count by _messageTime, account, region, namespace, event_name, user, tablename\n| formatDate(_messageTime, \"MM/dd/yyyy HH:mm:ss:SSS Z\") as message_date\n| fields message_date, account, region, namespace, event_name, user, tablename\n| fields -_messageTime" + "query": "Namespace=aws/dynamodb metric=AccountProvisionedWriteCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-15m", - "threshold": 5, + "timeRange": "-5m", + "threshold": 80, "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "occurrenceType": "Always", + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-15m", - "threshold": 5, + "timeRange": "-5m", + "threshold": 80, 
"thresholdType": "LessThan", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "occurrenceType": "Always", + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Classic Load Balancer - High 4XX Errors", - "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", + "name": "AWS API Gateway - High Integration Latency", + "description": "This alert fires when we detect that the average integration latency for a given API Gateway is greater than or equal to one second for 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1035,15 +1030,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elb metric=HTTPCode_ELB_4XX Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/elb metric=RequestCount Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" - }, - { - "rowId": "C", - "query": "#A * 100 / #B along loadbalancername, account, region, namespace" + "query": "Namespace=aws/apigateway metric=IntegrationLatency statistic=Average account=* region=* apiname=* | avg by apiname, namespace, region, account" } ], "triggers": [ @@ -1052,7 +1039,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 1000, "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", "minDataPoints": 2 @@ -1062,22 +1049,25 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 1000, "thresholdType": 
"LessThan", "occurrenceType": "Always", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - High Max Provisioned Table Read Capacity", - "description": "This alert fires when we detect that the average percentage of read provisioned capacity used by the highest read provisioned table of an account for a time interval of 5 minutes is great than or equal to 80%. High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", + "name": "AWS Classic Load Balancer - High Latency", + "description": "This alert fires when we detect that the average latency for a given Classic load balancer within a time interval of 5 minutes is greater than or equal to three seconds.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1087,45 +1077,44 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/dynamodb metric=MaxProvisionedTableReadCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" + "query": "Namespace=aws/elb metric=Latency Statistic=Average account=* region=* loadbalancername=* | eval(_value*1000) | sum by account, region, namespace, loadbalancername" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 3000, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", 
"resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 3000, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - High Redis Database Memory Usage", - "description": "This alert fires when the average database memory usage within a 5 minute interval for the Redis engine is high (>=95%). When the value reaches 100%, eviction may happen or write operations may fail based on ElastiCache policies thereby impacting application performance.", + "name": "AWS Application Load Balancer - High Latency", + "description": "This alert fires when we detect that the average latency for a given Application load balancer within a time interval of 5 minutes is greater than or equal to three seconds.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1135,45 +1124,44 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=DatabaseMemoryUsagePercentage statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" + "query": "Namespace=aws/applicationelb metric=TargetResponseTime Statistic=Average account=* region=* loadbalancer=* | eval(_value*1000) | sum by account, region, namespace, loadbalancer" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 95, + "threshold": 3000, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { 
- "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 95, + "threshold": 3000, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon ECS - High CPU Utilization", - "description": "This alert fires when the average CPU utilization within a 5 minute interval for a service within a cluster is high (>=85%).", + "name": "AWS DynamoDB - High Account Provisioned Read Capacity", + "description": "This alert fires when we detect that the average read capacity provisioned for an account for a time interval of 5 minutes is greater than or equal to 80%. 
High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1183,45 +1171,44 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/ecs metric=CPUUtilization statistic=Average account=* region=* ClusterName=* ServiceName=* | avg by ClusterName, ServiceName, account, region, namespace" + "query": "Namespace=aws/dynamodb metric=AccountProvisionedReadCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 80, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 80, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - High Redis Memory Fragmentation Ratio", - "description": "This alert fires when the average Redis memory fragmentation ratio for within a 5 minute interval is high (>=1.5). 
Value equal to or greater than 1.5 Indicate significant memory fragmentation.", + "name": "AWS Lambda - Low Provisioned Concurrency Utilization", + "description": "This alert fires when the average provisioned concurrency utilization for 5 minutes is low (<= 50%). This indicates low provisioned concurrency utilization efficiency.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1231,45 +1218,44 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=MemoryFragmentationRatio statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" + "query": "Namespace=aws/lambda metric=ProvisionedConcurrencyUtilization statistic=Average account=* region=* functionname=* | avg by functionname, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1.5, - "thresholdType": "GreaterThanOrEqual", - "field": null, + "threshold": 50, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1.5, - "thresholdType": "LessThan", - "field": null, + "threshold": 50, + "thresholdType": "GreaterThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - High CPU Utilization", - "description": "This alert fires when the 
average CPU utilization within a 5 minute interval for a host is high (>=90%). The CPUUtilization metric includes total CPU utilization across application, operating system and management processes. We highly recommend monitoring CPU utilization for hosts with two vCPUs or less.", + "name": "AWS Network Load Balancer - High Unhealthy Hosts", + "description": "This alert fires when we detect that there are too many unhealthy hosts (>=10%) within an interval of 5 minutes for a given network load balancer", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1279,45 +1265,52 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=CPUUtilization statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by CacheClusterId, CacheNodeId, account, region, namespace" + "query": "Namespace=aws/NetworkELB metric=UnHealthyHostCount Statistic=sum account=* region=* LoadBalancer=* AvailabilityZone=* | sum by LoadBalancer, AvailabilityZone, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/NetworkELB metric=HealthyHostCount Statistic=sum account=* region=* LoadBalancer=* AvailabilityZone=* | sum by LoadBalancer, AvailabilityZone, account, region, namespace" + }, + { + "rowId": "C", + "query": "#A * 100 / (#A + #B) along LoadBalancer, AvailabilityZone, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 90, + "threshold": 10, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 90, + "threshold": 10, "thresholdType": "LessThan", - 
"field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - High Account Provisioned Read Capacity", - "description": "This alert fires when we detect that the average read capacity provisioned for an account for a time interval of 5 minutes is greater than or equal to 80%. High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", + "name": "AWS Classic Load Balancer - High 5XX Errors", + "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1327,135 +1320,146 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/dynamodb metric=AccountProvisionedReadCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" + "query": "Namespace=aws/elb metric=HTTPCode_ELB_5XX Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/elb metric=RequestCount Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" + }, + { + "rowId": "C", + "query": "#A * 100 / #B along loadbalancername, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": 
"Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, + "threshold": 5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS SQS - Access from highly malicious sources", - "description": "This alert fires when an AWS - SQS resource is accessed from highly malicious IP addresses within last 5 minutes", + "name": "Amazon Elasticache - High Engine CPU Utilization", + "description": "This alert fires when the average CPU utilization for the Redis engine process within a 5 minute interval is high (>=90%). 
For larger node types with four vCPUs or more, use the EngineCPUUtilization metric to monitor and set thresholds for scaling.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=\"aws/sqs\" eventname eventsource \"sqs.amazonaws.com\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"sourceIPAddress\",\"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, recipient_account_id, requestParameters, responseElements, src_ip, error_code, error_message nodrop\n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, type, arn, username nodrop\n| json field=requestParameters \"queueUrl\" as queueUrlReq nodrop \n| json field=responseElements \"queueUrl\" as queueUrlRes nodrop\n| where event_source=\"sqs.amazonaws.com\" and !(src_ip matches \"*.amazonaws.com\")\n| if(event_name=\"CreateQueue\", queueUrlRes, queueUrlReq) as queueUrl \n| parse regex field=queueUrl \"(?[^\\/]*$)\"\n| if (isBlank(recipient_account_id), accountid, recipient_account_id) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status \n| count as ip_count by src_ip\n| lookup type, actor, raw, threatlevel as malicious_confidence from sumo://threat/cs on threat=src_ip\n| json field=raw \"labels[*].name\" as label_name \n| replace(label_name, \"\\\\/\",\"->\") as label_name\n| replace(label_name, \"\\\"\",\" \") as label_name\n| if (isEmpty(actor), \"Unassigned\", actor) as actor\n| where type=\"ip_address\" and malicious_confidence = \"high\"\n| sort by ip_count, src_ip\n| fields src_ip, malicious_confidence, actor, label_name" + "query": "Namespace=aws/elasticache 
metric=EngineCPUUtilization statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by CacheClusterId, CacheNodeId, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "GreaterThan", - "field": null + "threshold": 90, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 }, { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "LessThanOrEqual", - "field": null + "threshold": 90, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS API Gateway - High Latency", - "description": "This alert fires when we detect that the average latency for a given API Gateway is greater than or equal to one second for 5 minutes.", + "name": "AWS EC2 - High Memory Utilization", + "description": "This alert fires when the average memory utilization within a 5 minute interval for an EC2 instance is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "4m", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/apigateway metric=Latency statistic=Average account=* region=* apiname=* | avg by apiname, namespace, region, account" + "query": "Namespace=aws/ec2 metric=Mem_UsedPercent account=* region=* instanceid=* | avg by account, region, namespace, instanceid" } ], 
"triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1000, + "threshold": 85, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 1000, + "threshold": 85, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Classic Load Balancer - High 5XX Errors", - "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", + "name": "Amazon ECS - High Memory Utilization", + "description": "This alert fires when the average memory utilization within a 5 minute interval for a service within a cluster is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1465,15 +1469,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elb metric=HTTPCode_ELB_5XX Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/elb metric=RequestCount Statistic=Sum account=* region=* loadbalancername=* | sum by loadbalancername, account, region, namespace" - }, - { - "rowId": "C", - "query": "#A * 100 / #B along loadbalancername, account, region, namespace" + "query": "Namespace=aws/ecs 
metric=MemoryUtilization statistic=Average account=* region=* ClusterName=* ServiceName=* | avg by ClusterName, ServiceName, account, region, namespace" } ], "triggers": [ @@ -1482,7 +1478,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 85, "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", "minDataPoints": 2 @@ -1492,66 +1488,82 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 85, "thresholdType": "LessThan", "occurrenceType": "Always", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Classic Load Balancer - Access from Highly Malicious Sources", - "description": "This alert fires when the Classic load balancer is accessed from highly malicious IP addresses within last 5 minutes.", + "name": "AWS Lambda - High Percentage of Failed Requests", + "description": "This alert fires when we detect a large number of failed Lambda requests (>5%) within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", - "evaluationDelay": "0m", + "monitorType": "Metrics", + "evaluationDelay": "4m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/elb\n| parse \"* * * * * * * * * * * \\\"*\\\" \\\"*\\\" * *\" as datetime, loadbalancername, client, backend, request_processing_time, backend_processing_time, response_processing_time, elb_status_code, backend_status_code, received_bytes, sent_bytes, request, user_agent, ssl_cipher, ssl_protocol\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != 
\"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancername, account, region, namespace\n| lookup type, actor, raw, threatlevel as MaliciousConfidence from sumo://threat/cs on threat=ClientIp \n| json field=raw \"labels[*].name\" as LabelName \n| replace(LabelName, \"\\\\/\",\"->\") as LabelName\n| replace(LabelName, \"\\\"\",\" \") as LabelName\n| where type=\"ip_address\" and MaliciousConfidence=\"high\"\n| if (isEmpty(actor), \"Unassigned\", actor) as Actor\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancername, account, region, namespace, MaliciousConfidence, Actor, LabelName" + "query": "Namespace=aws/lambda metric=Errors Statistic=Sum account=* region=* functionname=* | sum by functionname, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/lambda metric=Invocations Statistic=Sum account=* region=* functionname=* | sum by functionname, account, region, namespace" + }, + { + "rowId": "C", + "query": "#A * 100 / #B along functionname, account, region, namespace" } ], "triggers": [ { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "GreaterThan", - "field": null + "threshold": 5, + "thresholdType": "GreaterThanOrEqual", + "occurrenceType": "Always", + "minDataPoints": 2 }, { - "detectionMethod": "LogsStaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 0, - "thresholdType": "LessThanOrEqual", - "field": null + "threshold": 5, + "thresholdType": "LessThan", + "occurrenceType": "Always", + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS EC2 - High Total 
CPU Utilization", - "description": "This alert fires when the average total CPU utilization within a 5 minute interval for an EC2 instance is high (>=85%).", + "name": "AWS DynamoDB - Multiple Tables deleted", + "description": "This alert fires when five or more tables are deleted within 15 minutes.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", + "monitorType": "Logs", "evaluationDelay": "0m", "alertName": null, "runAs": null, @@ -1559,93 +1571,87 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/ec2 metric=CPU_Total account=* region=* instanceid=* | avg by account, region, namespace, instanceid" + "query": "account=* region=* namespace=aws/dynamodb eventSource \"dynamodb.amazonaws.com\"\n| json \"eventSource\", \"eventName\", \"requestParameters.tableName\", \"sourceIPAddress\", \"userIdentity.userName\", \"userIdentity.sessionContext.sessionIssuer.userName\" as event_source, event_name, tablename, SourceIp, UserName, ContextUserName nodrop\n| where event_source = \"dynamodb.amazonaws.com\" and event_name = \"DeleteTable\"\n| if (isEmpty(UserName), ContextUserName, UserName) as user\n| count by _messageTime, account, region, namespace, event_name, user, tablename\n| formatDate(_messageTime, \"MM/dd/yyyy HH:mm:ss:SSS Z\") as message_date\n| fields message_date, account, region, namespace, event_name, user, tablename\n| fields -_messageTime" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 85, + "timeRange": "-15m", + "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 85, + "timeRange": 
"-15m", + "threshold": 5, "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "field": null } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - High Write Latency", - "description": "This alert fires when the average write latency of a database within a 5 minute interval is high (>=5 seconds) . High write latencies will affect the performance of your application.", + "name": "Amazon Elasticache - Multiple Failed Operations", + "description": "This alert fires when we detect multiple failed operations within a 15 minute interval for an ElastiCache service.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=WriteLatency statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "account=* region=* namespace=aws/elasticache \"\\\"eventSource\\\":\\\"elasticache.amazonaws.com\\\"\" errorCode errorMessage\n| json \"eventSource\", \"errorCode\", \"errorMessage\", \"userIdentity\", \"requestParameters\", \"responseElements\" as event_source, error_code, error_message, user_identity, requestParameters, responseElements nodrop\n| json field=requestParameters \"cacheClusterId\" as req_cacheClusterId nodrop\n| json field=responseElements \"cacheClusterId\" as res_cacheClusterId nodrop\n| json field=user_identity \"arn\", \"userName\" nodrop \n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountId, user nodrop\n| if 
(isEmpty(userName), user, userName) as user\n| if (isEmpty(req_cacheClusterId), res_cacheClusterId, req_cacheClusterId) as cacheclusterid\n| where event_source matches \"elasticache.amazonaws.com\" and !isEmpty(error_code) and !isEmpty(error_message) and !isEmpty(user)\n| count as event_count by _messageTime, account, region, event_source, error_code, error_message, user, cacheclusterid\n| formatDate(_messageTime, \"MM/dd/yyyy HH:mm:ss:SSS Z\") as message_date\n| fields message_date, account, region, event_source, error_code, error_message, user, cacheclusterid\n| fields -_messageTime" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 5, + "timeRange": "-15m", + "threshold": 10, "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-5m", - "threshold": 5, + "timeRange": "-15m", + "threshold": 10, "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "field": null } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - High Disk Queue Depth", - "description": "This alert fires when the average disk queue depth for a database is high (>=5) for an interval of 5 minutes. 
Higher this value, higher will be the number of outstanding I/Os (read/write requests) waiting to access the disk, which will impact the performance of your application.", + "name": "Amazon RDS - High CPU Utilization", + "description": "This alert fires when we detect that the average CPU utilization for a database is high (>=85%) for an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1655,141 +1661,134 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=DiskQueueDepth statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "Namespace=aws/rds metric=CPUUtilization statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 85, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 5, + "threshold": 85, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon ECS - High Memory Utilization", - "description": "This alert fires when the average memory utilization within a 5 minute interval for a service within a cluster is high (>=85%).", + "name": "AWS 
Classic Load Balancer - Access from Highly Malicious Sources", + "description": "This alert fires when the Classic load balancer is accessed from highly malicious IP addresses within last 5 minutes.", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/ecs metric=MemoryUtilization statistic=Average account=* region=* ClusterName=* ServiceName=* | avg by ClusterName, ServiceName, account, region, namespace" + "query": "account=* region=* namespace=aws/elb\n| parse \"* * * * * * * * * * * \\\"*\\\" \\\"*\\\" * *\" as datetime, loadbalancername, client, backend, request_processing_time, backend_processing_time, response_processing_time, elb_status_code, backend_status_code, received_bytes, sent_bytes, request, user_agent, ssl_cipher, ssl_protocol\n| parse regex \"(?<ClientIp>\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancername, account, region, namespace\n| threatlookup singleIndicator ClientIp\n| where (_threatlookup.type=\"ipv4-addr:value\" or _threatlookup.type=\"ipv6-addr:value\") and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as MaliciousConfidence\n| where MaliciousConfidence=\"high\"\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancername, account, region, namespace, MaliciousConfidence, Actor" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, 
"timeRange": "-5m", - "threshold": 85, - "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "threshold": 0, + "thresholdType": "GreaterThan", + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, - "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "threshold": 0, + "thresholdType": "LessThanOrEqual", + "field": null } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS DynamoDB - High Max Provisioned Table Write Capacity", - "description": "This alert fires when we detect that the average percentage of write provisioned capacity used by the highest write provisioned table of an account for a time interval of 5 minutes is great than or equal to 80%. 
High values indicate requests to the database are being throttled, which could further indicate that your application may not be working as intended.", + "name": "AWS SNS - Access from Highly Malicious Sources", + "description": "This alert fires when an Application AWS - SNS is accessed from highly malicious IP addresses within last 5 minutes", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Metrics", - "evaluationDelay": "4m", + "monitorType": "Logs", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/dynamodb metric=MaxProvisionedTableWriteCapacityUtilization statistic=Average account=* region=* | avg by namespace, region, account" + "query": "account=* region=* namespace=aws/sns \"\\\"eventsource\\\":\\\"sns.amazonaws.com\\\"\" sourceIPAddress\n| json \"userIdentity\", \"eventSource\", \"eventName\", \"awsRegion\", \"sourceIPAddress\", \"userAgent\", \"eventType\", \"recipientAccountId\", \"requestParameters\", \"responseElements\", \"requestID\", \"errorCode\", \"errorMessage\" as userIdentity, event_source, event_name, region, src_ip, user_agent, event_type, recipient_account_id, requestParameters, responseElements, request_id, error_code, error_message nodrop\n| where event_source = \"sns.amazonaws.com\" \n| json field=userIdentity \"accountId\", \"type\", \"arn\", \"userName\" as accountid, user_type, arn, username nodrop\n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountid, user nodrop\n| json field=requestParameters \"topicArn\", \"name\", \"resourceArn\", \"subscriptionArn\" as req_topic_arn, req_topic_name, resource_arn, subscription_arn nodrop \n| json field=responseElements \"topicArn\" as res_topic_arn nodrop\n| if (isBlank(req_topic_arn), res_topic_arn, req_topic_arn) as topic_arn\n| if (isBlank(topic_arn), resource_arn, topic_arn) as topic_arn\n| parse field=topic_arn \"arn:aws:sns:*:*:*\" 
as region_temp, accountid_temp, topic_arn_name_temp nodrop\n| parse field=subscription_arn \"arn:aws:sns:*:*:*:*\" as region_temp, accountid_temp, topic_arn_name_temp, arn_value_temp nodrop\n| if (isBlank(req_topic_name), topic_arn_name_temp, req_topic_name) as topicname\n| if (isBlank(accountid), recipient_account_id, accountid) as accountid\n| if (isEmpty(error_code), \"Success\", \"Failure\") as event_status\n| if (isEmpty(username), user_type, username) as user_type\n| count as ip_count by src_ip, event_name, region, accountid,user_type\n| threatlookup singleIndicator src_ip\n| where (_threatlookup.type=\"ipv4-addr:value\" or _threatlookup.type=\"ipv6-addr:value\") and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as malicious_confidence\n| where malicious_confidence=\"high\"\n| sum(ip_count) as threat_count by src_ip, event_name, region, accountid, malicious_confidence, actor" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, - "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "threshold": 0, + "thresholdType": "GreaterThan", + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 80, - "thresholdType": "LessThan", - "field": null, - "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", - "minDataPoints": 2 + "threshold": 0, + "thresholdType": "LessThanOrEqual", + "field": null } ], + 
"timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Application Load Balancer - High 5XX Errors", - "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", + "name": "AWS Application Load Balancer - High 4XX Errors", + "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 4xx within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1799,7 +1798,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/applicationelb metric=HTTPCode_ELB_5XX_Count Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" + "query": "Namespace=aws/applicationelb metric=HTTPCode_ELB_4XX_Count Statistic=Sum account=* region=* loadbalancer=* | sum by loadbalancer, account, region, namespace" }, { "rowId": "B", @@ -1812,42 +1811,41 @@ ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 5, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": 
null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - Multiple Failed Operations", - "description": "This alert fires when we detect multiple failed operations within a 15 minute interval for an ElastiCache service.", + "name": "AWS EC2 - High System CPU Utilization", + "description": "This alert fires when the average system CPU utilization within a 5 minute interval for an EC2 instance is high (>=85%).", "type": "MonitorsLibraryMonitorExport", - "monitorType": "Logs", + "monitorType": "Metrics", "evaluationDelay": "0m", "alertName": null, "runAs": null, @@ -1855,45 +1853,44 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/elasticache \"\\\"eventSource\\\":\\\"elasticache.amazonaws.com\\\"\" errorCode errorMessage\n| json \"eventSource\", \"errorCode\", \"errorMessage\", \"userIdentity\", \"requestParameters\", \"responseElements\" as event_source, error_code, error_message, user_identity, requestParameters, responseElements nodrop\n| json field=requestParameters \"cacheClusterId\" as req_cacheClusterId nodrop\n| json field=responseElements \"cacheClusterId\" as res_cacheClusterId nodrop\n| json field=user_identity \"arn\", \"userName\" nodrop \n| parse field=arn \":assumed-role/*\" as user nodrop \n| parse field=arn \"arn:aws:iam::*:*\" as accountId, user nodrop\n| if (isEmpty(userName), user, userName) as user\n| if (isEmpty(req_cacheClusterId), res_cacheClusterId, req_cacheClusterId) as cacheclusterid\n| where event_source matches \"elasticache.amazonaws.com\" and !isEmpty(error_code) and !isEmpty(error_message) and !isEmpty(user)\n| count as event_count by _messageTime, account, region, event_source, error_code, error_message, user, cacheclusterid\n| formatDate(_messageTime, \"MM/dd/yyyy HH:mm:ss:SSS Z\") as message_date\n| fields message_date, account, region, event_source, error_code, error_message, user, cacheclusterid\n| fields 
-_messageTime" + "query": "Namespace=aws/ec2 metric=CPU_Sys account=* region=* instanceid=* | avg by account, region, namespace, instanceid" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, - "timeRange": "-15m", - "threshold": 10, + "timeRange": "-5m", + "threshold": 85, "thresholdType": "GreaterThanOrEqual", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "occurrenceType": "Always", + "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, - "timeRange": "-15m", - "threshold": 10, + "timeRange": "-5m", + "threshold": 85, "thresholdType": "LessThan", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "occurrenceType": "Always", + "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - High CPU Utilization", - "description": "This alert fires when we detect that the average CPU utilization for a database is high (>=85%) for an interval of 5 minutes.", + "name": "AWS API Gateway - High Latency", + "description": "This alert fires when we detect that the average latency for a given API Gateway is greater than or equal to one second for 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1903,45 +1900,44 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=CPUUtilization statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "Namespace=aws/apigateway metric=Latency 
statistic=Average account=* region=* apiname=* | avg by apiname, namespace, region, account" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 1000, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 85, + "threshold": 1000, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Classic Load Balancer - High Latency", - "description": "This alert fires when we detect that the average latency for a given Classic load balancer within a time interval of 5 minutes is greater than or equal to three seconds.", + "name": "Amazon Elasticache - High CPU Utilization", + "description": "This alert fires when the average CPU utilization within a 5 minute interval for a host is high (>=90%). The CPUUtilization metric includes total CPU utilization across application, operating system and management processes. 
We highly recommend monitoring CPU utilization for hosts with two vCPUs or less.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1951,7 +1947,7 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elb metric=Latency Statistic=Average account=* region=* loadbalancername=* | eval(_value*1000) | sum by account, region, namespace, loadbalancername" + "query": "Namespace=aws/elasticache metric=CPUUtilization statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by CacheClusterId, CacheNodeId, account, region, namespace" } ], "triggers": [ @@ -1960,7 +1956,7 @@ "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 3000, + "threshold": 90, "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", "minDataPoints": 2 @@ -1970,22 +1966,25 @@ "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 3000, + "threshold": 90, "thresholdType": "LessThan", "occurrenceType": "Always", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "Amazon RDS - Low Burst Balance", - "description": "This alert fires when we observe a low burst balance (<= 50%) for a given database. 
A low burst balance indicates you won't be able to scale up as fast for burstable database workloads on gp2 volumes.", + "name": "AWS API Gateway - High 5XX Errors", + "description": "This alert fires where there are too many HTTP requests (>5%) with a response status of 5xx within an interval of 5 minutes.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -1995,45 +1994,52 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/rds metric=BurstBalance statistic=Average account=* region=* dbidentifier=* | avg by dbidentifier, namespace, region, account" + "query": "Namespace=aws/apigateway metric=5xxError Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" + }, + { + "rowId": "B", + "query": "Namespace=aws/apigateway metric=count Statistic=Sum account=* region=* apiname=* | sum by apiname, account, region, namespace" + }, + { + "rowId": "C", + "query": "#A * 100 / #B along account, region, namespace" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "LessThanOrEqual", - "field": null, + "threshold": 5, + "thresholdType": "GreaterThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 50, - "thresholdType": "GreaterThan", - "field": null, + "threshold": 5, + "thresholdType": "LessThan", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": 
null, + "automatedPlaybookIds": [] }, { - "name": "Amazon Elasticache - High Engine CPU Utilization", - "description": "This alert fires when the average CPU utilization for the Redis engine process within a 5 minute interval is high (>=90%). For larger node types with four vCPUs or more, use the EngineCPUUtilization metric to monitor and set thresholds for scaling.", + "name": "Amazon Elasticache - Low Redis Cache Hit Rate", + "description": "This alert fires when the average cache hit rate for Redis within a 5 minute interval is low (<= 80%). This indicates low efficiency of the Redis instance. If cache ratio is lower than 80%, that indicates a significant amount of keys are either evicted, expired, or don't exist.", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -2043,45 +2049,44 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/elasticache metric=EngineCPUUtilization statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by CacheClusterId, CacheNodeId, account, region, namespace" + "query": "Namespace=aws/elasticache metric=CacheHitRate statistic=Average account=* region=* CacheClusterId=* CacheNodeId=* | avg by account, region, namespace, CacheClusterId, CacheNodeId" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 90, - "thresholdType": "GreaterThanOrEqual", - "field": null, + "threshold": 80, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 90, - "thresholdType": "LessThan", - "field": null, + "threshold": 80, + "thresholdType": "GreaterThan", "occurrenceType": "Always", - 
"triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Network Load Balancer - High Unhealthy Hosts", - "description": "This alert fires when we detect that are there are too many unhealthy hosts (>=10%) within an interval of 5 minutes for a given network load balancer", + "name": "AWS SQS - Message processing not fast enough", + "description": "This alert fires when we detect message processing is not fast enough. That is, the average approximate age of the oldest non-deleted message in the queue is more than 5 seconds for an interval of 5 minutes", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", "evaluationDelay": "4m", @@ -2091,49 +2096,40 @@ "queries": [ { "rowId": "A", - "query": "Namespace=aws/NetworkELB metric=UnHealthyHostCount Statistic=sum account=* region=* LoadBalancer=* AvailabilityZone=* | sum by LoadBalancer, AvailabilityZone, account, region, namespace" - }, - { - "rowId": "B", - "query": "Namespace=aws/NetworkELB metric=HealthyHostCount Statistic=sum account=* region=* LoadBalancer=* AvailabilityZone=* | sum by LoadBalancer, AvailabilityZone, account, region, namespace" - }, - { - "rowId": "C", - "query": "#A * 100 / (#A + #B) along LoadBalancer, AvailabilityZone, account, region, namespace" + "query": "metric=ApproximateAgeOfOldestMessage Statistic=avg region=* account=* queuename=* namespace=aws/sqs | avg by account,region,namespace,queuename " } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, - "thresholdType": "GreaterThanOrEqual", - "field": null, + "threshold": 5, + "thresholdType": "GreaterThan", 
"occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 10, - "thresholdType": "LessThan", - "field": null, + "threshold": 5, + "thresholdType": "LessThanOrEqual", "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { "name": "AWS Application Load Balancer - Access from Highly Malicious Sources", @@ -2147,89 +2143,85 @@ "queries": [ { "rowId": "A", - "query": "account=* region=* namespace=aws/applicationelb\n| parse \"* * * * * * * * * * * * \\\"*\\\" \\\"*\\\" * * * \\\"*\\\"\" as Type, DateTime, loadbalancer, Client, Target, RequestProcessingTime, TargetProcessingTime, ResponseProcessingTime, ElbStatusCode, TargetStatusCode, ReceivedBytes, SentBytes, Request, UserAgent, SslCipher, SslProtocol, TargetGroupArn, TraceId\n| parse regex \"(?\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancer, account, region, namespace\n| lookup type, actor, raw, threatlevel as MaliciousConfidence from sumo://threat/cs on threat=ClientIp \n| json field=raw \"labels[*].name\" as LabelName \n| replace(LabelName, \"\\\\/\",\"->\") as LabelName\n| replace(LabelName, \"\\\"\",\" \") as LabelName\n| where type=\"ip_address\" and MaliciousConfidence=\"high\"\n| if (isEmpty(actor), \"Unassigned\", actor) as Actor\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancer, account, region, namespace, MaliciousConfidence, Actor, LabelName" + "query": "account=* region=* 
namespace=aws/applicationelb\n| parse \"* * * * * * * * * * * * \\\"*\\\" \\\"*\\\" * * * \\\"*\\\"\" as Type, DateTime, loadbalancer, Client, Target, RequestProcessingTime, TargetProcessingTime, ResponseProcessingTime, ElbStatusCode, TargetStatusCode, ReceivedBytes, SentBytes, Request, UserAgent, SslCipher, SslProtocol, TargetGroupArn, TraceId\n| parse regex \"(?<ClientIp>\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\" multi\n| where ClientIp != \"0.0.0.0\" and ClientIp != \"127.0.0.1\"\n| count as ip_count by ClientIp, loadbalancer, account, region, namespace\n| threatlookup singleIndicator ClientIp\n| where (_threatlookup.type=\"ipv4-addr:value\" or _threatlookup.type=\"ipv6-addr:value\") and !isNull(_threatlookup.confidence)\n| if (isEmpty(_threatlookup.actors), \"Unassigned\", _threatlookup.actors) as actor\n| if (_threatlookup.confidence >= 85, \"high\", if (_threatlookup.confidence >= 50, \"medium\", if (_threatlookup.confidence >= 15, \"low\", if (_threatlookup.confidence >= 0, \"unverified\", \"Unknown\")))) as MaliciousConfidence\n| where MaliciousConfidence=\"high\"\n| sum (ip_count) as ThreatCount by ClientIp, loadbalancer, account, region, namespace, MaliciousConfidence, Actor" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 0, "thresholdType": "GreaterThan", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "field": null }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "LogsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", "threshold": 0, "thresholdType": "LessThanOrEqual", - "field": null, - "occurrenceType": "ResultCount", - "triggerSource": "AllResults", - "minDataPoints": null + "field": null } ], + "timeZone": null, "notifications": [], "isDisabled": true, "groupNotifications": true, 
"playbook": "", "sloId": null, - "monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] }, { - "name": "AWS Application Load Balancer - High Latency", - "description": "This alert fires when we detect that the average latency for a given Application load balancer within a time interval of 5 minutes is greater than or equal to three seconds.", + "name": "AWS EC2 - High Disk Utilization", + "description": "This alert fires when the average disk utilization within a 5 minute time interval for an EC2 instance is high (>=85%).", "type": "MonitorsLibraryMonitorExport", "monitorType": "Metrics", - "evaluationDelay": "4m", + "evaluationDelay": "0m", "alertName": null, "runAs": null, "notificationGroupFields": [], "queries": [ { "rowId": "A", - "query": "Namespace=aws/applicationelb metric=TargetResponseTime Statistic=Average account=* region=* loadbalancer=* | eval(_value*1000) | sum by account, region, namespace, loadbalancer" + "query": "Namespace=aws/ec2 metric=Disk_UsedPercent account=* region=* instanceid=* | avg by account, region, namespace, instanceid, devname" } ], "triggers": [ { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "Critical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 3000, + "threshold": 85, "thresholdType": "GreaterThanOrEqual", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 }, { - "detectionMethod": "StaticCondition", + "detectionMethod": "MetricsStaticCondition", "triggerType": "ResolvedCritical", "resolutionWindow": null, "timeRange": "-5m", - "threshold": 3000, + "threshold": 85, "thresholdType": "LessThan", - "field": null, "occurrenceType": "Always", - "triggerSource": "AnyTimeSeries", "minDataPoints": 2 } ], + "timeZone": null, "notifications": [], "isDisabled": true, - "groupNotifications": false, + "groupNotifications": true, "playbook": "", "sloId": null, - 
"monitorTemplateId": null + "monitorTemplateId": null, + "tags": null, + "automatedPlaybookIds": [] } ] } \ No newline at end of file