diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml
index b08854adb..e6ac352a6 100644
--- a/.github/workflows/stackhpc.yml
+++ b/.github/workflows/stackhpc.yml
@@ -170,6 +170,12 @@ jobs:
         env:
           TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
 
+      # Captured as epoch milliseconds; passed to ansible/ci/check_loki.yml as end_timestamp
+      - name: Get timestamp in milliseconds before rebuild
+        id: timestamp
+        run: |
+          echo "TIME_BEFORE_REBUILD=$(date +%s%N | cut -b1-13)" >> "$GITHUB_OUTPUT"
+
       # - name: Build environment-specific compute image
       #   id: packer_build
       #   run: |
@@ -209,6 +215,14 @@ jobs:
           . environments/.stackhpc/activate
           ansible-playbook -vv ansible/ci/check_grafana.yml
 
+      - name: Check Loki Slurm logs persisted through rebuild
+        run: |
+          . venv/bin/activate
+          . environments/.stackhpc/activate
+          ansible-playbook -vv ansible/ci/check_loki.yml -e "end_timestamp=${{ steps.timestamp.outputs.TIME_BEFORE_REBUILD }} testuser_password=$TESTUSER_PASSWORD"
+        env:
+          TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
+
       - name: Delete infrastructure
         run: |
           . venv/bin/activate
diff --git a/ansible/ci/check_loki.yml b/ansible/ci/check_loki.yml
new file mode 100644
index 000000000..05bcdc86d
--- /dev/null
+++ b/ansible/ci/check_loki.yml
@@ -0,0 +1,43 @@
+# CI check: query Loki through the Grafana API and assert that slurmd.service
+# log entries exist in the 20 minutes up to end_timestamp.
+# Expects variables: end_timestamp (epoch milliseconds) and testuser_password.
+- hosts: control
+  gather_facts: false
+  become: false
+  tasks:
+    - name: Get Loki datasource uid
+      ansible.builtin.uri:
+        url: http://localhost:{{ grafana_port }}/api/datasources/name/Loki
+        url_username: 'testuser'
+        url_password: "{{ testuser_password }}"
+        follow_redirects: all
+      register: uid_response
+
+    - name: Get Loki logs from before rebuild
+      ansible.builtin.uri:
+        url: http://localhost:{{ grafana_port }}/api/ds/query
+        follow_redirects: all
+        url_username: testuser
+        url_password: "{{ testuser_password }}"
+        method: POST
+        body_format: json
+        headers:
+          Accept: application/json
+          Content-Type: application/json
+        # Queries from 20 mins before timestamp to timestamp
+        body: |
+          {
+            "queries":[
+              {
+                "expr":"{unit=\"slurmd.service\"} |= ``",
+                "datasource":{"uid":"{{ uid_response.json.uid }}"},
+                "format":"time_series"
+              }],
+            "from":"{{ end_timestamp | int - 1200000 }}",
+            "to":"{{ end_timestamp }}"
+          }
+      register: log_query_content
+
+    - name: Check that logs exist
+      ansible.builtin.assert:
+        that: log_query_content.json.results.A.frames[0].data['values'][2] | length > 0
diff --git a/ansible/roles/grafana-dashboards/files/loki-pod-logs-dashboard.json b/ansible/roles/grafana-dashboards/files/loki-pod-logs-dashboard.json
new file mode 100644
index 000000000..7b594079a
--- /dev/null
+++ b/ansible/roles/grafana-dashboards/files/loki-pod-logs-dashboard.json
@@ -0,0 +1,228 @@
+{
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": {
+          "type": "grafana",
+          "uid": "-- Grafana --"
+        },
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "description": "Search pod logs stored in Loki",
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 0,
+  "id": 31,
+  "links": [],
+  "panels": [
+    {
+      "datasource": "Loki",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "hidden",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "bars",
+            "fillOpacity": 100,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "short"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 3,
+        "w": 24,
+        "x": 0,
+        "y": 0
+      },
+      "id": 6,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": false
+        },
+        "tooltip": {
+          "mode": "multi",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": "Loki",
+          "expr": "sum(count_over_time({namespace=\"$namespace\", pod=~\"$pod\"} |~ \"$search\"[$__interval]))",
+          "refId": "A"
+        }
+      ],
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Loki",
+      "gridPos": {
+        "h": 25,
+        "w": 24,
+        "x": 0,
+        "y": 3
+      },
+      "id": 2,
+      "maxDataPoints": "",
+      "options": {
+        "dedupStrategy": "none",
+        "enableLogDetails": true,
+        "prettifyLogMessage": false,
+        "showCommonLabels": false,
+        "showLabels": false,
+        "showTime": true,
+        "sortOrder": "Descending",
+        "wrapLogMessage": true
+      },
+      "targets": [
+        {
+          "datasource": "Loki",
+          "expr": "{namespace=\"$namespace\", pod=~\"$pod\"} |~ \"$search\"",
+          "refId": "A"
+        }
+      ],
+      "title": "Logs Panel",
+      "type": "logs"
+    }
+  ],
+  "schemaVersion": 39,
+  "tags": [],
+  "templating": {
+    "list": [
+      {
+        "allValue": ".+",
+        "current": {
+          "selected": true,
+          "text": "ingress-nginx",
+          "value": "ingress-nginx"
+        },
+        "datasource": "Loki",
+        "definition": "label_values(namespace)",
+        "hide": 0,
+        "includeAll": false,
+        "multi": false,
+        "name": "namespace",
+        "options": [],
+        "query": "label_values(namespace)",
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 0,
+        "tagValuesQuery": "",
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": false
+      },
+      {
+        "allValue": ".+",
+        "current": {
+          "selected": false,
+          "text": "All",
+          "value": "$__all"
+        },
+        "datasource": "Loki",
+        "definition": "label_values({namespace=~\"$namespace\"}, pod)",
+        "hide": 0,
+        "includeAll": true,
+        "multi": true,
+        "name": "pod",
+        "options": [],
+        "query": "label_values({namespace=~\"$namespace\"}, pod)",
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 0,
+        "tagValuesQuery": "",
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": false
+      },
+      {
+        "current": {},
+        "hide": 0,
+        "name": "search",
+        "options": [],
+        "query": "",
+        "skipUrlSync": false,
+        "type": "textbox"
+      }
+    ]
+  },
+  "time": {
+    "from": "now-6h",
+    "to": "now"
+  },
+  "timepicker": {
+    "refresh_intervals": [
+      "5s",
+      "10s",
+      "30s",
+      "1m",
+      "5m",
+      "15m",
+      "30m",
+      "1h",
+      "2h",
+      "1d"
+    ]
+  },
+  "timezone": "",
+  "title": "Loki / Pod Logs",
+  "uid": "209fd89b771c318dd442225414a50b59",
+  "version": 1,
+  "weekStart": ""
+}
diff --git a/ansible/roles/grafana-dashboards/files/loki-systemd-logs-dashboard.json b/ansible/roles/grafana-dashboards/files/loki-systemd-logs-dashboard.json
new file mode 100644
index 000000000..9b19c358c
--- /dev/null
+++ b/ansible/roles/grafana-dashboards/files/loki-systemd-logs-dashboard.json
@@ -0,0 +1,232 @@
+{
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": {
+          "type": "grafana",
+          "uid": "-- Grafana --"
+        },
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "description": "Search systemd logs stored in Loki",
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 0,
+  "id": 33,
+  "links": [],
+  "panels": [
+    {
+      "datasource": "Loki",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "hidden",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "bars",
+            "fillOpacity": 100,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "short"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 3,
+        "w": 24,
+        "x": 0,
+        "y": 0
+      },
+      "id": 6,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": false
+        },
+        "tooltip": {
+          "mode": "multi",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": "Loki",
+          "expr": "sum(count_over_time({unit=~\"$unit\", hostname=~\"$hostname\"} |~ \"$search\"[$__interval]))",
+          "refId": "A"
+        }
+      ],
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Loki",
+      "gridPos": {
+        "h": 25,
+        "w": 24,
+        "x": 0,
+        "y": 3
+      },
+      "id": 2,
+      "maxDataPoints": "",
+      "options": {
+        "dedupStrategy": "none",
+        "enableLogDetails": true,
+        "prettifyLogMessage": false,
+        "showCommonLabels": false,
+        "showLabels": false,
+        "showTime": true,
+        "sortOrder": "Descending",
+        "wrapLogMessage": true
+      },
+      "targets": [
+        {
+          "datasource": "Loki",
+          "expr": "{unit=~\"$unit\", hostname=~\"$hostname\"} |~ \"$search\"",
+          "refId": "A"
+        }
+      ],
+      "title": "Logs Panel",
+      "type": "logs"
+    }
+  ],
+  "schemaVersion": 39,
+  "tags": [],
+  "templating": {
+    "list": [
+      {
+        "allValue": ".+",
+        "current": {
+          "selected": true,
+          "text": "ansible-init.service",
+          "value": "ansible-init.service"
+        },
+        "datasource": "Loki",
+        "definition": "label_values(unit)",
+        "hide": 0,
+        "includeAll": false,
+        "multi": false,
+        "name": "unit",
+        "options": [],
+        "query": "label_values(unit)",
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 0,
+        "tagValuesQuery": "",
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": false
+      },
+      {
+        "allValue": ".+",
+        "current": {
+          "selected": true,
+          "text": [
+            "All"
+          ],
+          "value": [
+            "$__all"
+          ]
+        },
+        "datasource": "Loki",
+        "definition": "label_values({unit=~\"$unit\"}, hostname)",
+        "hide": 0,
+        "includeAll": true,
+        "multi": true,
+        "name": "hostname",
+        "options": [],
+        "query": "label_values({unit=~\"$unit\"}, hostname)",
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 0,
+        "tagValuesQuery": "",
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": false
+      },
+      {
+        "current": {},
+        "hide": 0,
+        "name": "search",
+        "options": [],
+        "query": "",
+        "skipUrlSync": false,
+        "type": "textbox"
+      }
+    ]
+  },
+  "time": {
+    "from": "now-3h",
+    "to": "now"
+  },
+  "timepicker": {
+    "refresh_intervals": [
+      "5s",
+      "10s",
+      "30s",
+      "1m",
+      "5m",
+      "15m",
+      "30m",
+      "1h",
+      "2h",
+      "1d"
+    ]
+  },
+  "timezone": "",
+  "title": "Loki / Systemd Logs",
+  "uid": "fa1bd43aed803111be9cc923cada9811",
+  "version": 1,
+  "weekStart": ""
+}
diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml
index e9c04632f..9b5579c54 100644
--- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml
+++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml
@@ -193,3 +193,58 @@ kube_prometheus_stack_release_values: >-
     kube_prometheus_stack_release_defaults |
       combine(kube_prometheus_stack_release_overrides, recursive = True)
   }}
+
+kube_prometheus_stack_loki_release_values:
+  test_pod:
+    image: "bats/bats:{{ kube_prometheus_stack_loki_test_pod_tag }}"
+  loki:
+    config:
+      compactor:
+        retention_enabled: true
+      limits_config:
+        retention_period: "{{ kube_prometheus_stack_loki_retention_period }}"
+    persistence:
+      enabled: true
+      selector:
+        matchLabels:
+          app.kubernetes.io/name: loki-dir
+      size: "{{ kube_prometheus_stack_loki_persistence_size }}"
+    nodeSelector:
+      clusterrole: server
+    image:
+      tag: "{{ kube_prometheus_stack_loki_image_tag }}"
+  grafana:
+    sidecar:
+      datasources:
+        enabled: false
+  promtail:
+    config:
+      snippets:
+        extraScrapeConfigs: |
+          - job_name: journal
+            journal:
+              path: /var/log/journal
+              max_age: 12h
+              labels:
+                job: systemd-journal
+            relabel_configs:
+              - source_labels: ['__journal__systemd_unit']
+                target_label: 'unit'
+              - source_labels: ['__journal__hostname']
+                target_label: 'hostname'
+              - source_labels: ['__journal_priority_keyword']
+                target_label: level
+    extraVolumes:
+      - name: journal
+        hostPath:
+          path: /var/log/journal
+      - name: machine-id
+        hostPath:
+          path: /etc/machine-id
+    extraVolumeMounts:
+      - name: journal
+        mountPath: /var/log/journal
+        readOnly: true
+      - name: machine-id
+        mountPath: /etc/machine-id
+        readOnly: true
diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/install.yml b/ansible/roles/kube_prometheus_stack/defaults/main/install.yml
index b1fcdc7c1..d4088751f 100644
--- a/ansible/roles/kube_prometheus_stack/defaults/main/install.yml
+++ b/ansible/roles/kube_prometheus_stack/defaults/main/install.yml
@@ -9,3 +9,6 @@ image_list:
 - { name: "quay.io/kiwigrid/k8s-sidecar", tag: "{{ grafana_sidecar_image_tag }}" }
 - { name: "registry.k8s.io/kube-state-metrics/kube-state-metrics", tag: "{{ kube_prometheus_stack_metrics_image_tag }}" }
 - { name: "registry.k8s.io/ingress-nginx/kube-webhook-certgen", tag: "{{ kube_prometheus_stack_patch_image_tag }}" }
+- { name: "docker.io/grafana/loki", tag: "{{ kube_prometheus_stack_loki_image_tag }}" }
+- { name: "docker.io/grafana/promtail", tag: "{{ kube_prometheus_stack_loki_image_tag }}" }
+- { name: "docker.io/bats/bats", tag: "{{ kube_prometheus_stack_loki_test_pod_tag }}" }
diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml
index bf6097089..d428a8672 100644
--- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml
+++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml
@@ -20,6 +20,14 @@ kube_prometheus_stack_wait_timeout: 5m
 kube_prometheus_stack_metrics_image_tag: v2.12.0
 kube_prometheus_stack_patch_image_tag: v20221220-controller-v1.5.1-58-g787ea74b6
 
+kube_prometheus_stack_loki_chart_version: 2.10.2
+kube_prometheus_stack_loki_image_tag: 2.9.3  # also promtail tag
+kube_prometheus_stack_loki_test_pod_tag: 1.8.2
+
+kube_prometheus_stack_loki_data_dir: "{{ appliances_state_dir }}/loki"
+kube_prometheus_stack_loki_persistence_size: "10Gi"
+kube_prometheus_stack_loki_retention_period: "72h"
+
 control_ip: "{{ hostvars[groups['control'].0].ansible_host }}"
 
 grafana_auth_anonymous: false
diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml
index 13488de58..0a55e5edd 100644
--- a/ansible/roles/kube_prometheus_stack/tasks/main.yml
+++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml
@@ -82,6 +82,46 @@
           path: "{{ prometheus_db_dir }}"
           type: DirectoryOrCreate
 
+# Loki image seems to be hardcoded to use this uid/gid even when changed in
+# podSecurityContext
+- name: Create Loki group
+  ansible.builtin.group:
+    name: loki
+    gid: 10001
+
+- name: Create Loki user
+  ansible.builtin.user:
+    name: loki
+    uid: 10001
+    group: loki
+
+- name: Create Loki data directory
+  ansible.builtin.file:
+    state: directory
+    path: "{{ kube_prometheus_stack_loki_data_dir }}"
+    owner: 10001
+    group: 10001
+    mode: '775'
+
+- name: Create Loki hostPath volume in /var/lib/state
+  kubernetes.core.k8s:
+    namespace: "{{ kube_prometheus_stack_release_namespace }}"
+    definition:
+      apiVersion: v1
+      kind: PersistentVolume
+      metadata:
+        name: loki-dir
+        labels:
+          app.kubernetes.io/name: loki-dir
+      spec:
+        capacity:
+          storage: "{{ kube_prometheus_stack_loki_persistence_size }}" # not enforced but may be internally by loki?
+        accessModes:
+          - ReadWriteOnce
+        hostPath:
+          path: "{{ kube_prometheus_stack_loki_data_dir }}"
+          type: Directory
+
 - name: Setting up k3s services for OnDemand Exporter
   when: groups['openondemand'] | count > 0
   block:
@@ -176,6 +216,16 @@
   ansible.builtin.import_role:
     name: grafana-dashboards
 
+- name: Install loki stack helm chart
+  kubernetes.core.helm:
+    chart_ref: loki-stack
+    chart_repo_url: https://grafana.github.io/helm-charts
+    chart_version: "{{ kube_prometheus_stack_loki_chart_version }}"
+    release_name: loki
+    release_namespace: "{{ kube_prometheus_stack_release_namespace }}"
+    release_values: "{{ kube_prometheus_stack_loki_release_values }}"
+    wait: true
+
 - name: Install kube-prometheus-stack on target Kubernetes cluster
   kubernetes.core.helm:
     chart_ref: "{{ kube_prometheus_stack_chart_name }}"
diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
index 6b440865e..5d5caf729 100644
--- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
+++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
@@ -1,7 +1,7 @@
 {
     "cluster_image": {
-        "RL8": "openhpc-RL8-241029-0905-f23c2fca",
-        "RL9": "openhpc-RL9-241029-0949-f23c2fca",
-        "RL9-cuda": "openhpc-cuda-RL9-241029-0905-f23c2fca"
+        "RL8": "openhpc-RL8-241106-1719-0780a372",
+        "RL9": "openhpc-RL9-241106-1719-0780a372",
+        "RL9-cuda": "openhpc-cuda-RL9-241107-0924-0780a372"
     }
 }
diff --git a/environments/common/inventory/group_vars/all/grafana.yml b/environments/common/inventory/group_vars/all/grafana.yml
index ee874d2ed..7da47e113 100644
--- a/environments/common/inventory/group_vars/all/grafana.yml
+++ b/environments/common/inventory/group_vars/all/grafana.yml
@@ -38,6 +38,10 @@ grafana_dashboards_default:
       - placeholder: DS_PROMETHEUS
         replacement: prometheus
     revision_id: 3
+  - dashboard_file: loki-pod-logs-dashboard.json
+    replacements: []
+  - dashboard_file: loki-systemd-logs-dashboard.json
+    replacements: []
 
 grafana_dashboards: "{{ grafana_dashboards_default + (openondemand_dashboard if groups.get('openondemand') else []) }}"
 # Configmap names of kube prometheus stack's default dashboards to exclude
@@ -73,7 +77,13 @@ grafana_datasources:
       version: '7.10.2'
       flavor: elasticsearch
     editable: true
-    # readOnly: false
+    readOnly: false
+  - name: Loki
+    url: http://loki:3100
+    type: loki
+    access: proxy
+    version: 1
+    isDefault: false
 
 grafana_plugins:
   - grafana-opensearch-datasource 2.8.1
diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl
index fe922c78e..42e578054 100644
--- a/packer/openstack.pkr.hcl
+++ b/packer/openstack.pkr.hcl
@@ -125,9 +125,9 @@ variable "volume_size" {
     type = map(number)
     default = {
         # fat image builds, GB:
-        rocky-latest = 15
+        rocky-latest = 20
         rocky-latest-cuda = 30
-        openhpc = 15
+        openhpc = 20
         openhpc-cuda = 30
     }
 }