techeer-sv · baekhangyeol · Nov 9, 2023 · Nov 6, 2023 · Nov 7, 2023 · Nov 7, 2023
diff --git a/alertmanager/config.yml b/alertmanager/config.yml
@@ -0,0 +1,18 @@
+global:
+  # The Slack incoming-webhook URL is a credential, so it is read from a file
+  # instead of being committed here. Provide it (untracked) as
+  # ./alertmanager/slack_api_url; docker-compose mounts that directory here.
+  slack_api_url_file: '/etc/alertmanager/slack_api_url'
+
+route:
+  receiver: 'slack-notifications'
+  # Alerts that are still firing are re-sent at this interval.
+  repeat_interval: 2m
+receivers:
+  - name: 'slack-notifications'
+    slack_configs:
+      - channel: '#_monitoring'
+        send_resolved: true
+        # One line per alert, taken from the rule's annotations.
+        title: "{{ range .Alerts }}{{ .Annotations.summary }}\n{{ end }}"
+        text: "{{ range .Alerts }}{{ .Annotations.description }}\n{{ end }}"
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -64,6 +64,8 @@ services:
       - ./prometheus/data:/prometheus
     command:
       - '--config.file=/etc/prometheus/prometheus.yml'
+    links:
+      - alertmanager  # short form: the service name is also the link alias
     depends_on:
       - springboot
 
@@ -78,3 +80,15 @@ services:
     depends_on:
       - prometheus
       - springboot
+
+  alertmanager:  # alert target referenced by prometheus/prometheus.yml
+    image: prom/alertmanager
+    container_name: alert-manager
+    ports:
+      - "9093:9093"
+    volumes:
+      - ./alertmanager/:/etc/alertmanager/  # supplies config.yml to the container
+    restart: always
+    command:
+      - '--config.file=/etc/alertmanager/config.yml'
+      - '--storage.path=/alertmanager'
diff --git a/prometheus/alert.rules b/prometheus/alert.rules
@@ -0,0 +1,65 @@
+groups:
+- name: alert.rules
+  rules:
+  # A scrape target has reported up == 0 for a full minute.
+  - alert: InstanceDown
+    expr: up == 0
+    for: 1m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Endpoint {{ $labels.instance }}"
+      identifier: "{{ $labels.instance }}"
+      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."
+
+  # Available memory has stayed below 10% of total memory for 2 minutes.
+  - alert: HostOutOfMemory
+    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Host out of memory (instance {{ $labels.instance }})"
+      description: |-
+        Node memory is filling up (< 10% left)
+          VALUE = {{ $value }}
+          LABELS: {{ $labels }}
+
+  # Major page faults exceed 1000 per second (1m rate) for 2 minutes.
+  - alert: HostMemoryUnderMemoryPressure
+    expr: rate(node_vmstat_pgmajfault[1m]) > 1000
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Host memory under memory pressure (instance {{ $labels.instance }})"
+      description: |-
+        The node is under heavy memory pressure. High rate of major page faults
+          VALUE = {{ $value }}
+          LABELS: {{ $labels }}
+
+  # A filesystem that is not read-only has less than 10% space available.
+  - alert: HostOutOfDiskSpace
+    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Host out of disk space (instance {{ $labels.instance }})"
+      description: |-
+        Disk is almost full (< 10% left)
+          VALUE = {{ $value }}
+          LABELS: {{ $labels }}
+
+  # Non-idle CPU, averaged per instance over a 2m rate window, is above 80%.
+  - alert: HostHighCpuLoad
+    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
+    for: 0m  # no pending period: fires as soon as the expression matches
+    labels:
+      severity: warning
+    annotations:
+      summary: "Host high CPU load (instance {{ $labels.instance }})"
+      description: |-
+        CPU load is > 80%
+          VALUE = {{ $value }}
+          LABELS: {{ $labels }}
diff --git a/prometheus/prometheus.yml b/prometheus/prometheus.yml
@@ -4,10 +4,12 @@ global:
 
 alerting:
   alertmanagers:
-    - static_configs:
-        - targets:
+    - scheme: http  # scheme and static_configs are keys of the same entry
+      static_configs:
+        - targets: ["alertmanager:9093"]
 
 rule_files:
+  - 'alert.rules'  # NOTE(review): relative path; confirm the file is mounted beside prometheus.yml
 
 scrape_configs:
   - job_name: "prometheus"