Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added loki with host logs forwarded #468

Open
wants to merge 11 commits into
base: feature/k3s-monitoring
Choose a base branch
from
13 changes: 13 additions & 0 deletions .github/workflows/stackhpc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,11 @@ jobs:
env:
TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}

# Capture the current epoch time in milliseconds and publish it as the step
# output TIME_BEFORE_REBUILD (referenced as steps.timestamp.outputs.*).
- name: Get timestamp in milliseconds before rebuild
  id: timestamp
  run: |
    # %s%N prints epoch seconds followed by nanoseconds; keeping the first
    # 13 bytes truncates that to milliseconds.
    echo "TIME_BEFORE_REBUILD=$(date +%s%N | cut -b1-13)" >> "$GITHUB_OUTPUT"

# - name: Build environment-specific compute image
# id: packer_build
# run: |
Expand Down Expand Up @@ -209,6 +214,14 @@ jobs:
. environments/.stackhpc/activate
ansible-playbook -vv ansible/ci/check_grafana.yml

# Run the check_loki.yml playbook, handing it the timestamp captured by the
# `timestamp` step and the test user's password as extra vars.
- name: Check Loki Slurm logs persisted through rebuild
  env:
    # Exposed to the shell so the secret is expanded by bash at run time.
    TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
  run: |
    . venv/bin/activate
    . environments/.stackhpc/activate
    ansible-playbook -vv ansible/ci/check_loki.yml \
      -e "end_timestamp=${{ steps.timestamp.outputs.TIME_BEFORE_REBUILD }} testuser_password=$TESTUSER_PASSWORD"

- name: Delete infrastructure
run: |
. venv/bin/activate
Expand Down
42 changes: 42 additions & 0 deletions ansible/ci/check_loki.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
---
# CI check: query Loki via Grafana's HTTP API and assert that slurmd.service
# log entries exist in the 20 minutes leading up to `end_timestamp`.
#
# Variables this play reads but does not define:
#   end_timestamp      - end of the query window (cast with `int`; presumably
#                        epoch milliseconds - confirm against the caller)
#   testuser_password  - password for the Grafana `testuser` account
#   grafana_port       - port Grafana listens on at localhost on the control host
- hosts: control
  gather_facts: false
  become: false
  tasks:
    # The query API addresses datasources by uid, so look it up by name first.
    - name: Get Loki datasource uid
      ansible.builtin.uri:
        url: "http://localhost:{{ grafana_port }}/api/datasources/name/Loki"
        url_username: testuser
        url_password: "{{ testuser_password }}"
        follow_redirects: all
      register: uid_response

    # NOTE: no task-level `vars` override for testuser_password here; the
    # password must come from the caller so a placeholder can never shadow it.
    - name: Get Loki logs from before rebuild
      ansible.builtin.uri:
        url: "http://localhost:{{ grafana_port }}/api/ds/query"
        follow_redirects: all
        url_username: testuser
        url_password: "{{ testuser_password }}"
        method: POST
        body_format: json
        headers:
          Accept: application/json
          Content-Type: application/json
        # Queries from 20 mins (1200000 ms) before timestamp to timestamp
        body: |
          {
            "queries":[
              {
                "expr":"{unit=\"slurmd.service\"} |= ``",
                "datasource":{"uid":"{{ uid_response.json.uid }}"},
                "format":"time_series"
              }],
            "from":"{{ end_timestamp | int - 1200000 }}",
            "to":"{{ end_timestamp }}"
          }
      register: log_query_content

    - name: Check that logs exist
      ansible.builtin.assert:
        that:
          # Guard first so an empty result fails cleanly instead of raising
          # an index error on frames[0].
          - log_query_content.json.results.A.frames | length > 0
          # values[2] is presumably the log-line column of the frame - confirm
          # against the Grafana response schema.
          - log_query_content.json.results.A.frames[0].data['values'][2] | length > 0
        fail_msg: >-
          No slurmd.service log entries were returned by Loki for the
          20 minutes before end_timestamp={{ end_timestamp }}.
228 changes: 228 additions & 0 deletions ansible/roles/grafana-dashboards/files/loki-pod-logs-dashboard.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Search pod logs stored in Loki",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 31,
"links": [],
"panels": [
{
"datasource": "Loki",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "hidden",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "bars",
"fillOpacity": 100,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 24,
"x": 0,
"y": 0
},
"id": 6,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": false
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": "Loki",
"expr": "sum(count_over_time({namespace=\"$namespace\", pod=~\"$pod\"} |~ \"$search\"[$__interval]))",
"refId": "A"
}
],
"type": "timeseries"
},
{
"datasource": "Loki",
"gridPos": {
"h": 25,
"w": 24,
"x": 0,
"y": 3
},
"id": 2,
"maxDataPoints": "",
"options": {
"dedupStrategy": "none",
"enableLogDetails": true,
"prettifyLogMessage": false,
"showCommonLabels": false,
"showLabels": false,
"showTime": true,
"sortOrder": "Descending",
"wrapLogMessage": true
},
"targets": [
{
"datasource": "Loki",
"expr": "{namespace=\"$namespace\", pod=~\"$pod\"} |~ \"$search\"",
"refId": "A"
}
],
"title": "Logs Panel",
"type": "logs"
}
],
"schemaVersion": 39,
"tags": [],
"templating": {
"list": [
{
"allValue": ".+",
"current": {
"selected": true,
"text": "ingress-nginx",
"value": "ingress-nginx"
},
"datasource": "Loki",
"definition": "label_values(namespace)",
"hide": 0,
"includeAll": false,
"multi": false,
"name": "namespace",
"options": [],
"query": "label_values(namespace)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": ".+",
"current": {
"selected": false,
"text": "All",
"value": "$__all"
},
"datasource": "Loki",
"definition": "label_values({namespace=~\"$namespace\"}, pod)",
"hide": 0,
"includeAll": true,
"multi": true,
"name": "pod",
"options": [],
"query": "label_values({namespace=~\"$namespace\"}, pod)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"current": {},
"hide": 0,
"name": "search",
"options": [],
"query": "",
"skipUrlSync": false,
"type": "textbox"
}
]
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
]
},
"timezone": "",
"title": "Loki / Pod Logs",
"uid": "209fd89b771c318dd442225414a50b59",
"version": 1,
"weekStart": ""
}
Loading
Loading