diff --git a/alertmanager/config.yml b/alertmanager/config.yml
new file mode 100644
index 00000000..456cae3f
--- /dev/null
+++ b/alertmanager/config.yml
@@ -0,0 +1,19 @@
+global:
+  # The Slack webhook URL is a credential, so it is read from a file instead of
+  # being committed. Create ./alertmanager/slack_api_url on the host (kept out of
+  # version control); the alertmanager volume exposes it at this path.
+  slack_api_url_file: '/etc/alertmanager/slack_api_url'
+
+route:
+  receiver: 'slack-notifications'
+  # NOTE(review): still-firing alerts are re-sent every 2 minutes (the
+  # Alertmanager default is 4h) - confirm this is not just a testing value.
+  repeat_interval: 2m
+
+receivers:
+  - name: 'slack-notifications'
+    slack_configs:
+      - channel: '#_monitoring'
+        send_resolved: true
+        title: "{{ range .Alerts }}{{ .Annotations.summary }}\n{{ end }}"
+        text: "{{ range .Alerts }}{{ .Annotations.description }}\n{{ end }}"
diff --git a/docker-compose.yml b/docker-compose.yml
index 27a5814e..f5233f2c 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -64,6 +64,8 @@ services:
       - ./prometheus/data:/prometheus
     command:
       - '--config.file=/etc/prometheus/prometheus.yml'
+    links:
+      - alertmanager:alertmanager
     depends_on:
       - springboot
 
@@ -78,3 +80,17 @@ services:
     depends_on:
       - prometheus
       - springboot
+
+  # Alertmanager: receives alerts fired by Prometheus and forwards them to Slack.
+  alertmanager:
+    image: prom/alertmanager
+    container_name: alert-manager
+    ports:
+      - "9093:9093"
+    volumes:
+      # config.yml and the uncommitted slack_api_url secret file live in this directory.
+      - ./alertmanager/:/etc/alertmanager/
+    restart: always
+    command:
+      - "--config.file=/etc/alertmanager/config.yml"
+      - "--storage.path=/alertmanager"
diff --git a/prometheus/alert.rules b/prometheus/alert.rules
new file mode 100644
index 00000000..6665bb57
--- /dev/null
+++ b/prometheus/alert.rules
@@ -0,0 +1,56 @@
+# Alerting rules loaded by Prometheus via rule_files in prometheus.yml.
+# NOTE(review): the Host* rules use node_* metrics, presumably from node_exporter -
+# confirm a scrape job for it exists.
+groups:
+  - name: alert.rules
+    rules:
+      # A scrape target has reported up == 0 for a full minute.
+      - alert: InstanceDown
+        expr: up == 0
+        for: 1m
+        labels:
+          severity: "critical"
+        annotations:
+          summary: "Endpoint {{ $labels.instance }}"
+          identifier: "{{ $labels.instance }}"
+          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes."
+
+      # Available memory has stayed below 10% of total memory for 2 minutes.
+      - alert: HostOutOfMemory
+        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host out of memory (instance {{ $labels.instance }})"
+          description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+      # Major page faults have exceeded 1000/s (1m rate) for 2 minutes.
+      - alert: HostMemoryUnderMemoryPressure
+        expr: rate(node_vmstat_pgmajfault[1m]) > 1000
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host memory under memory pressure (instance {{ $labels.instance }})"
+          description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+      # A filesystem that is not read-only has had less than 10% free space for 2 minutes.
+      - alert: HostOutOfDiskSpace
+        expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host out of disk space (instance {{ $labels.instance }})"
+          description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+      # Non-idle CPU (2m rate, averaged per instance) is above 80%; `for: 0m` fires without a hold period.
+      - alert: HostHighCpuLoad
+        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host high CPU load (instance {{ $labels.instance }})"
+          description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
diff --git a/prometheus/prometheus.yml b/prometheus/prometheus.yml
index f2d93e62..785b97ee 100644
--- a/prometheus/prometheus.yml
+++ b/prometheus/prometheus.yml
@@ -4,10 +4,12 @@ global:
 
 alerting:
   alertmanagers:
-    - static_configs:
-        - targets:
+    - scheme: http
+      static_configs:
+        - targets: ["alertmanager:9093"]
 
 rule_files:
+  - 'alert.rules'
 
 scrape_configs:
   - job_name: "prometheus"