forked from NVIDIA/NeMo
-
Notifications
You must be signed in to change notification settings - Fork 1
53 lines (50 loc) · 1.46 KB
/
monitor-vms.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# Periodically runs the per-VM maintenance workflow against every online GPU
# runner registered for NVIDIA/NeMo.
#
# Flow:
#   1. `pre-flight` asks the GitHub REST API for the repository's self-hosted
#      runners and builds a JSON matrix of {vm, n_gpus} entries.
#   2. `maintenance` fans out over that matrix and calls the reusable
#      workflow `monitor-single-vm.yml` once per VM (per this workflow's name,
#      that is where the controlled reboot happens).
name: Reboots VMs in a controlled way

on:
  schedule:
    # Every 15 minutes, starting at minute 0 of each hour.
    - cron: '0/15 * * * *'
  workflow_dispatch:

jobs:
  pre-flight:
    runs-on: ubuntu-latest
    environment: main
    outputs:
      # Compact JSON array of {"vm": <runner name>, "n_gpus": <string|null>}.
      list-of-vms: ${{ steps.main.outputs.main }}
    steps:
      - name: Get list of VMs
        id: main
        env:
          GITHUB_TOKEN: ${{ secrets.PAT }}
        run: |
          # Abort on the first failing command, unset variable, or failing
          # pipeline stage instead of publishing a bogus matrix.
          set -euo pipefail

          # --fail turns HTTP errors (bad token, rate limit, ...) into a
          # non-zero exit instead of handing an error document to jq.
          # per_page=100 raises the page size from the API default of 30 so
          # runners beyond the first 30 are not silently dropped.
          # NOTE(review): more than 100 registered runners would still need
          # real pagination.
          RUNNERS=$(curl --fail --silent --show-error -L \
            -H "Accept: application/vnd.github+json" \
            -H "Authorization: Bearer $GITHUB_TOKEN" \
            -H "X-GitHub-Api-Version: 2022-11-28" \
            "https://api.github.com/repos/NVIDIA/NeMo/actions/runners?per_page=100")

          # Keep runners that are online and whose name contains "gpu".
          # n_gpus is the first character of the first label ending in "gpu"
          # (null when no such label exists).
          # NOTE(review): presumably labels look like "2gpu"; a multi-digit
          # count would be truncated to its first digit - confirm.
          MATRIX=$(jq -c '[
              .runners[]
              | select(.status == "online")
              | select(.name | contains("gpu"))
              | {
                  "vm": .name,
                  "n_gpus": [
                    .labels[]
                    | select(.name | endswith("gpu")) | .name
                  ][0][:1]
                }
            ]' <<< "$RUNNERS")

          # Quoted so the JSON brackets are never treated as a shell glob.
          echo "main=$MATRIX" | tee -a "$GITHUB_OUTPUT"

  maintenance:
    needs: pre-flight
    # An empty `include` list makes the matrix invalid and fails the run, so
    # skip the job when no matching runner is online.
    if: ${{ needs.pre-flight.outputs.list-of-vms != '[]' }}
    strategy:
      # One VM failing maintenance must not cancel the others.
      fail-fast: false
      matrix:
        include: ${{ fromJSON(needs.pre-flight.outputs.list-of-vms) }}
    uses: ./.github/workflows/monitor-single-vm.yml
    with:
      vm: ${{ matrix.vm }}
      n_gpus: ${{ matrix.n_gpus }}
    secrets: inherit  # pragma: allowlist secret