From 4e6b07943fd9113d9f6909b0fa229f7a62ffcb70 Mon Sep 17 00:00:00 2001
From: Jeremy Fix
Date: Wed, 28 Jun 2023 12:33:05 +0200
Subject: [PATCH 01/39] Fix hardcoded slurm username

The task hardcodes the slurm username even though the variable file
already provides a field for this property.
---
 roles/slurm/tasks/setup-user.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/roles/slurm/tasks/setup-user.yml b/roles/slurm/tasks/setup-user.yml
index 710102fad..097d42803 100644
--- a/roles/slurm/tasks/setup-user.yml
+++ b/roles/slurm/tasks/setup-user.yml
@@ -7,7 +7,7 @@
 - name: create slurm user
   user:
-    name: slurm
+    name: "{{ slurm_username }}"
     state: present
     system: yes
     home: "{{ slurm_user_home }}"

From ad45ce0e2e0091811742c1de951610a18b813132 Mon Sep 17 00:00:00 2001
From: Adam Tetelman
Date: Wed, 25 Jan 2023 09:45:48 -0800
Subject: [PATCH 02/39] Add version to k8s debug

---
 scripts/k8s/debug.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/k8s/debug.sh b/scripts/k8s/debug.sh
index ba7e69ee6..11306edc2 100755
--- a/scripts/k8s/debug.sh
+++ b/scripts/k8s/debug.sh
@@ -32,6 +32,7 @@ ansible kube-node -ba "docker info" -vv > ${logdir}/docker-info.log
 ansible kube-node -ba "cat /etc/docker/daemon.json" -vv > ${logdir}/docker-daemon.log
 
 # Kubectl (Generic for any Kubernetes cluster)
+kubectl version
 kubectl get pvc -A > ${logdir}/get-pvc.log
 kubectl get pv -A > ${logdir}/get-pv.log
 kubectl get pods -A > ${logdir}/get-pods.log
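A note on PATCH 02 above: unlike the surrounding commands in debug.sh, the new `kubectl version` call prints to stdout rather than capturing into ${logdir}. If the output should land with the rest of the debug bundle, a variant following the script's existing redirection pattern would look like this (a sketch only; the log filename is hypothetical and not part of the patch):

    kubectl version > ${logdir}/kubectl-version.log 2>&1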
From a9e59c4cfdb086e0ab834aa4c8a2dd5ef52234bf Mon Sep 17 00:00:00 2001
From: Adam Tetelman
Date: Wed, 25 Jan 2023 09:46:22 -0800
Subject: [PATCH 03/39] remove docker runtime tests from multinode jenkins

---
 workloads/jenkins/Jenkinsfile-multi-nightly | 120 ++------------------
 1 file changed, 7 insertions(+), 113 deletions(-)

diff --git a/workloads/jenkins/Jenkinsfile-multi-nightly b/workloads/jenkins/Jenkinsfile-multi-nightly
index f630bd9b0..fa0797278 100644
--- a/workloads/jenkins/Jenkinsfile-multi-nightly
+++ b/workloads/jenkins/Jenkinsfile-multi-nightly
@@ -43,117 +43,6 @@ pipeline {
           bash -x ./workloads/jenkins/scripts/vagrant-startup.sh
           '''
 
-        echo "Cluster Up - MGMT Nodes - device plugin + docker"
-        sh '''
-          export DEEPOPS_K8S_OPERATOR=false
-          export DEEPOPS_K8S_CONTAINER_MANAGER=docker
-          bash -x ./workloads/jenkins/scripts/test-cluster-up.sh
-          '''
-
-        echo "Get K8S Cluster Status"
-        sh '''
-          bash -x ./workloads/jenkins/scripts/get-k8s-debug.sh
-          '''
-
-        echo "Verify we can run a GPU job"
-        sh '''
-          timeout 500 bash -x ./workloads/jenkins/scripts/run-gpu-job.sh
-          '''
-
-        echo "Verify ingress config"
-        sh '''
-          bash -x ./workloads/jenkins/scripts/verify-ingress-config.sh
-          '''
-
-        echo "Verify local docker registry"
-        sh '''
-          bash -x ./workloads/jenkins/scripts/test-local-registry.sh
-          '''
-
-        echo "Test running a Deep Learning Example"
-        sh '''
-          timeout 1200 bash -x ./workloads/jenkins/scripts/test-dle-deployment.sh
-          '''
-
-        echo "Verify rsyslog forwarding is working for the k8s cluster"
-        sh '''
-          bash -x ./workloads/jenkins/scripts/test-rsyslog-k8s.sh
-          '''
-
-        echo "Test Kubeflow installation"
-        sh '''
-          # TODO: timeout 4000 bash -x ./workloads/jenkins/scripts/test-kubeflow.sh
-          '''
-
-        echo "Test Monitoring installation"
-        sh '''
-          timeout 1200 bash -x ./workloads/jenkins/scripts/test-monitoring.sh
-          '''
-
-        echo "Test Dashboard installation"
-        sh '''
-          timeout 180 bash -x ./workloads/jenkins/scripts/test-dashboard.sh
-          '''
-
-        echo "Test Kubeflow pipeline"
-        sh '''
-          # TODO: timeout 1500 bash -x ./workloads/jenkins/scripts/test-kubeflow-pipeline.sh
-          '''
-
-        echo "Start new virtual environment"
-        sh '''
-          bash -x ./workloads/jenkins/scripts/vagrant-startup.sh
-          '''
-
-        echo "Cluster Up - MGMT Nodes gpu operator + docker"
-        sh '''
-          export DEEPOPS_K8S_OPERATOR=true
-          export DEEPOPS_K8S_CONTAINER_MANAGER=docker
-          bash -x ./workloads/jenkins/scripts/test-cluster-up.sh
-          '''
-
-        echo "Get K8S Cluster Status"
-        sh '''
-          export DEEPOPS_K8S_OPERATOR=true
-          bash -x ./workloads/jenkins/scripts/get-k8s-debug.sh
-          '''
-
-        echo "Verify we can run a GPU job"
-        sh '''
-          export DEEPOPS_K8S_OPERATOR=true
-          timeout 500 bash -x ./workloads/jenkins/scripts/run-gpu-job.sh
-          '''
-
-        echo "Verify ingress config"
-        sh '''
-          bash -x ./workloads/jenkins/scripts/verify-ingress-config.sh
-          '''
-
-        echo "Verify local docker registry"
-        sh '''
-          bash -x ./workloads/jenkins/scripts/test-local-registry.sh
-          '''
-
-        echo "Verify rsyslog forwarding is working for the k8s cluster"
-        sh '''
-          bash -x ./workloads/jenkins/scripts/test-rsyslog-k8s.sh
-          '''
-
-        echo "Test Monitoring installation"
-        sh '''
-          timeout 1200 bash -x ./workloads/jenkins/scripts/test-monitoring.sh
-          '''
-
-        echo "Test Dashboard installation"
-        sh '''
-          timeout 180 bash -x ./workloads/jenkins/scripts/test-dashboard.sh
-          '''
-
-        echo "Start new virtual environment"
-        sh '''
-          bash -x ./workloads/jenkins/scripts/vagrant-startup.sh
-          '''
-
         echo "Cluster Up - MGMT Nodes gpu operator + containerd"
         sh '''
           export DEEPOPS_K8S_OPERATOR=true
@@ -183,6 +72,11 @@ pipeline {
           echo "unsupported configuration" # TODO bash -x ./workloads/jenkins/scripts/test-local-registry.sh
           '''
 
+        echo "Test running a Deep Learning Example"
+        sh '''
+          timeout 1200 bash -x ./workloads/jenkins/scripts/test-dle-deployment.sh
+          '''
+
         echo "Verify rsyslog forwarding is working for the k8s cluster"
         sh '''
           bash -x ./workloads/jenkins/scripts/test-rsyslog-k8s.sh
@@ -213,10 +107,10 @@ pipeline {
           bash -x ./workloads/jenkins/scripts/vagrant-startup.sh
           '''
 
-        echo "Cluster Up - MGMT Nodes gpu operator + docker + drivers"
+        echo "Cluster Up - MGMT Nodes gpu operator + containerd + drivers"
         sh '''
           export DEEPOPS_K8S_OPERATOR_EXISTING_SOFTWARE=true
-          export DEEPOPS_K8S_CONTAINER_MANAGER=docker
+          export DEEPOPS_K8S_CONTAINER_MANAGER=containerd
           bash -x ./workloads/jenkins/scripts/test-cluster-up.sh
           '''

From c8a21623745a1708439a2df161a008f10d0ef5b6 Mon Sep 17 00:00:00 2001
From: Adam Tetelman
Date: Wed, 25 Jan 2023 09:46:36 -0800
Subject: [PATCH 04/39] Add dle test back to nightly jenkins

---
 workloads/jenkins/Jenkinsfile-nightly | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/workloads/jenkins/Jenkinsfile-nightly b/workloads/jenkins/Jenkinsfile-nightly
index 4bb2a9a21..6f7ba91a4 100644
--- a/workloads/jenkins/Jenkinsfile-nightly
+++ b/workloads/jenkins/Jenkinsfile-nightly
@@ -72,6 +72,11 @@ pipeline {
           echo "unsupported configuration" # TODO bash -x ./workloads/jenkins/scripts/test-local-registry.sh
           '''
 
+        echo "Test running a Deep Learning Example"
+        sh '''
+          timeout 1200 bash -x ./workloads/jenkins/scripts/test-dle-deployment.sh
+          '''
+
         echo "Verify rsyslog forwarding is working for the k8s cluster"
         sh '''
           bash -x ./workloads/jenkins/scripts/test-rsyslog-k8s.sh
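PATCH 05 below re-enables the local registry test when running under the containerd runtime. For reference, a rough manual equivalent of what that test exercises might look like the following sketch (it assumes the registry.local:31500 registry from config.example and a node with nerdctl installed; the --insecure-registry flag mirrors the nerdctl_extra_flags workaround that appears later in this series):

    # Push a scratch image through the cluster-local insecure registry
    nerdctl --insecure-registry pull busybox:latest
    nerdctl --insecure-registry tag busybox:latest registry.local:31500/busybox:smoke
    nerdctl --insecure-registry push registry.local:31500/busybox:smoke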
From cc7c2f3582fb2716ab7ca06101ea2594addf6cfe Mon Sep 17 00:00:00 2001
From: Adam Tetelman
Date: Wed, 25 Jan 2023 09:48:52 -0800
Subject: [PATCH 05/39] Test local docker registry even with containerd runtime

---
 workloads/jenkins/Jenkinsfile-multi-nightly | 4 ++--
 workloads/jenkins/Jenkinsfile-nightly       | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/workloads/jenkins/Jenkinsfile-multi-nightly b/workloads/jenkins/Jenkinsfile-multi-nightly
index fa0797278..e1e465b11 100644
--- a/workloads/jenkins/Jenkinsfile-multi-nightly
+++ b/workloads/jenkins/Jenkinsfile-multi-nightly
@@ -20,7 +20,7 @@ pipeline {
     steps {
       // The only difference between the nightly and multi-nightly Jenkinsfiles should be changing GPU quantity from 1 to 2
       // TODO: ideally lock should work with declared stages
-      lock(resource: null, label: 'gpu', quantity: 2, variable: 'GPUDATA') {
+      lock(resource: null, label: 'gpu', quantity: 1, variable: 'GPUDATA') {
         echo "Reset repo and unmunge files"
         sh '''
           git reset --hard
@@ -69,7 +69,7 @@ pipeline {
 
         echo "Verify local docker registry"
         sh '''
-          echo "unsupported configuration" # TODO bash -x ./workloads/jenkins/scripts/test-local-registry.sh
+          bash -x ./workloads/jenkins/scripts/test-local-registry.sh
           '''
 
         echo "Test running a Deep Learning Example"
diff --git a/workloads/jenkins/Jenkinsfile-nightly b/workloads/jenkins/Jenkinsfile-nightly
index 6f7ba91a4..e1e465b11 100644
--- a/workloads/jenkins/Jenkinsfile-nightly
+++ b/workloads/jenkins/Jenkinsfile-nightly
@@ -69,7 +69,7 @@ pipeline {
 
         echo "Verify local docker registry"
         sh '''
-          echo "unsupported configuration" # TODO bash -x ./workloads/jenkins/scripts/test-local-registry.sh
+          bash -x ./workloads/jenkins/scripts/test-local-registry.sh
           '''
 
         echo "Test running a Deep Learning Example"

From 25895aed3d599511ad779c1d0f8db62636d78e4e Mon Sep 17 00:00:00 2001
From: Adam Tetelman
Date: Wed, 25 Jan 2023 11:34:31 -0800
Subject: [PATCH 06/39] add ansible version to debug

---
 scripts/k8s/debug.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/k8s/debug.sh b/scripts/k8s/debug.sh
index 11306edc2..bdcc9f76f 100755
--- a/scripts/k8s/debug.sh
+++ b/scripts/k8s/debug.sh
@@ -22,6 +22,7 @@ git branch > ${logdir}/git-branch.log
 git status > ${logdir}/git-status.log
 git diff > ${logdir}/git-diff.log
 git log --pretty=oneline | head -n 20 > ${logdir}/git-log.log
+ansible --version
 
 # GPU configuration
 ansible kube-node -ba "nvidia-smi" -vv > ${logdir}/nvidia-smi.log

From 2ea2547e446f87d001a433d68e1000daf1f8696f Mon Sep 17 00:00:00 2001
From: Adam Tetelman
Date: Wed, 29 Mar 2023 11:23:20 -0700
Subject: [PATCH 07/39] Bump to latest kubespray release of 2.21 with bugfixes

---
 submodules/kubespray | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/submodules/kubespray b/submodules/kubespray
index 813576efe..08467ad6b 160000
--- a/submodules/kubespray
+++ b/submodules/kubespray
@@ -1 +1 @@
-Subproject commit 813576efeb6e32e09b1d11fe9ed7be880dd7df79
+Subproject commit 08467ad6b3bdd5b15c33e3a63d476630766bd04a

From fc783044a5175147170bf76dae9c54d4d504ef43 Mon Sep 17 00:00:00 2001
From: Adam Tetelman
Date: Wed, 29 Mar 2023 01:43:24 +0000
Subject: [PATCH 08/39] change containerd_snapshotter default to native, based on GitHub workarounds for CentOS containerd

---
 config.example/group_vars/k8s-cluster.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/config.example/group_vars/k8s-cluster.yml b/config.example/group_vars/k8s-cluster.yml
index 112b0cc26..c3b91b69b 100644
--- a/config.example/group_vars/k8s-cluster.yml
+++ b/config.example/group_vars/k8s-cluster.yml
@@ -74,6 +74,11 @@ docker_registry_mirrors: "{{ groups['kube-master'] | map('regex_replace', '^(.*)$', 'http://\\1:5000') | list }}"
 containerd_insecure_registries:
   "registry.local:31500": "http://registry.local:31500"
 
+# Workaround an issue where kubespray defaults are causing containerd failures
+# 
https://github.com/kubernetes-sigs/cri-tools/issues/436 +# https://github.com/kubernetes-sigs/cri-tools/issues/710 +containerd_snapshotter: "native" + # Work-around for https://github.com/kubernetes-sigs/kubespray/issues/8529 nerdctl_extra_flags: " --insecure-registry" image_command_tool: "crictl" From fbde1207953da87fba35953b158b0cfe3324647d Mon Sep 17 00:00:00 2001 From: Adam Tetelman Date: Sat, 15 Apr 2023 01:49:47 +0000 Subject: [PATCH 09/39] Comment out/remove support for local insecure containerd registries until bug is fixed --- config.example/group_vars/k8s-cluster.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/config.example/group_vars/k8s-cluster.yml b/config.example/group_vars/k8s-cluster.yml index c3b91b69b..82f3a2eea 100644 --- a/config.example/group_vars/k8s-cluster.yml +++ b/config.example/group_vars/k8s-cluster.yml @@ -69,10 +69,12 @@ docker_insecure_registries: "{{ groups['kube-master']|map('regex_replace', '^(.* crio_insecure_registries: "{{ groups['kube-master']|map('regex_replace', '^(.*)$', '\\1:5000')|list + ['registry.local:31500']}}" docker_registry_mirrors: "{{ groups['kube-master'] | map('regex_replace', '^(.*)$', 'http://\\1:5000') | list }}" -# TODO: Add support in containerd for automatically setting up registry -# mirrors, not just the k8s-local registry -containerd_insecure_registries: - "registry.local:31500": "http://registry.local:31500" +# TODO: The presence of an insecure local containerd registry in K8s v1.24+ seems to be causing an issue, add support for this back when the issue is fixed +# BUG: https://github.com/kubernetes-sigs/kubespray/issues/9956 +## TODO: Add support in containerd for automatically setting up registry +## mirrors, not just the k8s-local registry +#containerd_insecure_registries: +# "registry.local:31500": "http://registry.local:31500" # Workaround an issue where kubespray defaults are causing containerd failures # https://github.com/kubernetes-sigs/cri-tools/issues/436 From b6d24e3fb9f9f7826a44460466e4667b2a1ce427 Mon Sep 17 00:00:00 2001 From: Adam Tetelman Date: Sat, 15 Apr 2023 03:06:02 +0000 Subject: [PATCH 10/39] Bump metallb from 0.12.1 to 0.13.9 --- scripts/k8s/deploy_loadbalancer.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/k8s/deploy_loadbalancer.sh b/scripts/k8s/deploy_loadbalancer.sh index ba30d45a8..6a85e2dfc 100755 --- a/scripts/k8s/deploy_loadbalancer.sh +++ b/scripts/k8s/deploy_loadbalancer.sh @@ -23,7 +23,7 @@ fi # Add Helm metallb repo if it doesn't exist HELM_CHARTS_REPO_METALLB="${HELM_CHARTS_REPO_METALLB:-https://metallb.github.io/metallb}" -HELM_METALLB_CHART_VERSION=${HELM_METALLB_CHART_VERSION:-0.12.1} +HELM_METALLB_CHART_VERSION=${HELM_METALLB_CHART_VERSION:-0.13.9} if ! 
helm repo list | grep metallb >/dev/null 2>&1 ; then helm repo add metallb "${HELM_CHARTS_REPO_METALLB}" helm repo update From a34fd68d9e69dd5d46aac4deb821ee61f60cda83 Mon Sep 17 00:00:00 2001 From: Adam Tetelman Date: Sat, 15 Apr 2023 07:21:09 +0000 Subject: [PATCH 11/39] Update metallb deployment to use new CRD and remove deprecated inlineconfig --- config.example/helm/metallb-resources.yml | 26 +++++++++++++++++++++++ config.example/helm/metallb.yml | 10 --------- scripts/k8s/deploy_loadbalancer.sh | 1 + 3 files changed, 27 insertions(+), 10 deletions(-) create mode 100644 config.example/helm/metallb-resources.yml diff --git a/config.example/helm/metallb-resources.yml b/config.example/helm/metallb-resources.yml new file mode 100644 index 000000000..000288e44 --- /dev/null +++ b/config.example/helm/metallb-resources.yml @@ -0,0 +1,26 @@ +# This was autogenerated by MetalLB's custom resource generator. +apiVersion: metallb.io/v1beta1 +kind: IPAddressPool +metadata: + creationTimestamp: null + name: default + namespace: deepops-loadbalancer +# Default address range matches private network for the virtual cluster +# defined in virtual/. +# You should set this address range based on your site's infrastructure. +spec: + addresses: + - 10.0.0.100-10.0.0.110 +status: {} +--- +apiVersion: metallb.io/v1beta1 +kind: L2Advertisement +metadata: + creationTimestamp: null + name: l2advertisement1 + namespace: deepops-loadbalancer +spec: + ipAddressPools: + - default +status: {} +--- diff --git a/config.example/helm/metallb.yml b/config.example/helm/metallb.yml index 7f0910fdd..03419647f 100644 --- a/config.example/helm/metallb.yml +++ b/config.example/helm/metallb.yml @@ -1,14 +1,4 @@ --- -# Default address range matches private network for the virtual cluster -# defined in virtual/. -# You should set this address range based on your site's infrastructure. -configInline: - address-pools: - - name: default - protocol: layer2 - addresses: - - 10.0.0.100-10.0.0.110 - controller: nodeSelector: node-role.kubernetes.io/master: "" diff --git a/scripts/k8s/deploy_loadbalancer.sh b/scripts/k8s/deploy_loadbalancer.sh index 6a85e2dfc..66932fe7a 100755 --- a/scripts/k8s/deploy_loadbalancer.sh +++ b/scripts/k8s/deploy_loadbalancer.sh @@ -42,4 +42,5 @@ fi if ! helm status metallb -n deepops-loadbalancer >/dev/null 2>&1; then kubectl create namespace deepops-loadbalancer helm install --wait metallb metallb/metallb "${helm_install_args[@]}" --version ${HELM_METALLB_CHART_VERSION} --namespace deepops-loadbalancer + kubectl create -n deepops-loadbalancer -f "${DEEPOPS_CONFIG_DIR}/helm/metallb-resources.yml" fi From 983544a8999806fd40411fe84806ce013b2b81c1 Mon Sep 17 00:00:00 2001 From: Adam Tetelman Date: Sat, 15 Apr 2023 09:06:55 +0000 Subject: [PATCH 12/39] label metallb ns properly --- scripts/k8s/deploy_loadbalancer.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/k8s/deploy_loadbalancer.sh b/scripts/k8s/deploy_loadbalancer.sh index 66932fe7a..b6fcdb345 100755 --- a/scripts/k8s/deploy_loadbalancer.sh +++ b/scripts/k8s/deploy_loadbalancer.sh @@ -41,6 +41,9 @@ fi # Set up the MetalLB load balancer if ! 
helm status metallb -n deepops-loadbalancer >/dev/null 2>&1; then kubectl create namespace deepops-loadbalancer + kubectl label namespace deepops-loadbalancer pod-security.kubernetes.io/enforce=privileged + kubectl label namespace deepops-loadbalancer pod-security.kubernetes.io/audit=privileged + kubectl label namespace deepops-loadbalancer pod-security.kubernetes.io/warn=privileged helm install --wait metallb metallb/metallb "${helm_install_args[@]}" --version ${HELM_METALLB_CHART_VERSION} --namespace deepops-loadbalancer kubectl create -n deepops-loadbalancer -f "${DEEPOPS_CONFIG_DIR}/helm/metallb-resources.yml" fi From e65946a2e1e6376ef0f481b34e5f630a18ed3551 Mon Sep 17 00:00:00 2001 From: Adam Tetelman Date: Thu, 6 Jul 2023 11:37:59 -0700 Subject: [PATCH 13/39] Update Jenkins munge for new metallb config --- workloads/jenkins/scripts/munge-files.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workloads/jenkins/scripts/munge-files.sh b/workloads/jenkins/scripts/munge-files.sh index e179dce12..5c060b86f 100644 --- a/workloads/jenkins/scripts/munge-files.sh +++ b/workloads/jenkins/scripts/munge-files.sh @@ -30,7 +30,7 @@ if [ ${DEEPOPS_FULL_INSTALL} ]; then # mgmt02, mgmt03, gpu02 fi echo "Also fix IPs in the load balancer config" -sed -i -e "s/10\\.0\\.0\\.100-10\\.0\\.0\\.110$/10.0.0.1${GPU01}0-10.0.0.1${GPU01}9/g" config.example/helm/metallb.yml +sed -i -e "s/10\\.0\\.0\\.100-10\\.0\\.0\\.110$/10.0.0.1${GPU01}0-10.0.0.1${GPU01}9/g" config.example/helm/metallb-resources.yml echo "Increase debug scope for ansible-playbook commands" sed -i -e "s/ansible-playbook/ansible-playbook -v/g" virtual/scripts/* From 795e5e96fe0ec506fd582160bbd57a371ec93287 Mon Sep 17 00:00:00 2001 From: Adam Tetelman Date: Mon, 10 Jul 2023 19:27:53 +0000 Subject: [PATCH 14/39] Update core monitoring/LN services to use control-plane instead of master role --- config.example/helm/metallb.yml | 2 +- config.example/helm/monitoring-no-persist.yml | 8 ++++---- config.example/helm/monitoring.yml | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/config.example/helm/metallb.yml b/config.example/helm/metallb.yml index 03419647f..0af61b983 100644 --- a/config.example/helm/metallb.yml +++ b/config.example/helm/metallb.yml @@ -1,4 +1,4 @@ --- controller: nodeSelector: - node-role.kubernetes.io/master: "" + node-role.kubernetes.io/control-plane: "" diff --git a/config.example/helm/monitoring-no-persist.yml b/config.example/helm/monitoring-no-persist.yml index 685dad8bb..ecab97827 100644 --- a/config.example/helm/monitoring-no-persist.yml +++ b/config.example/helm/monitoring-no-persist.yml @@ -1,6 +1,6 @@ prometheusOperator: nodeSelector: - node-role.kubernetes.io/master: "" + node-role.kubernetes.io/control-plane: "" prometheus: ingress: @@ -27,7 +27,7 @@ prometheus: action: replace target_label: kubernetes_node nodeSelector: - node-role.kubernetes.io/master: "" + node-role.kubernetes.io/control-plane: "" service: type: NodePort nodePort: 30500 @@ -54,7 +54,7 @@ alertmanager: nginx.ingress.kubernetes.io/rewrite-target: / alertmanagerSpec: nodeSelector: - node-role.kubernetes.io/master: "" + node-role.kubernetes.io/control-plane: "" service: type: NodePort nodePort: 30400 @@ -69,7 +69,7 @@ grafana: nginx.ingress.kubernetes.io/ssl-redirect: "false" nginx.ingress.kubernetes.io/rewrite-target: / nodeSelector: - node-role.kubernetes.io/master: "" + node-role.kubernetes.io/control-plane: "" service: type: NodePort nodePort: 30200 diff --git a/config.example/helm/monitoring.yml 
b/config.example/helm/monitoring.yml index b892eee40..624e59453 100644 --- a/config.example/helm/monitoring.yml +++ b/config.example/helm/monitoring.yml @@ -1,6 +1,6 @@ prometheusOperator: nodeSelector: - node-role.kubernetes.io/master: "" + node-role.kubernetes.io/control-plane: "" prometheus: ingress: @@ -37,7 +37,7 @@ prometheus: requests: storage: 10Gi nodeSelector: - node-role.kubernetes.io/master: "" + node-role.kubernetes.io/control-plane: "" service: type: NodePort nodePort: 30500 @@ -64,7 +64,7 @@ alertmanager: nginx.ingress.kubernetes.io/rewrite-target: / alertmanagerSpec: nodeSelector: - node-role.kubernetes.io/master: "" + node-role.kubernetes.io/control-plane: "" service: type: NodePort nodePort: 30400 @@ -79,7 +79,7 @@ grafana: nginx.ingress.kubernetes.io/ssl-redirect: "false" nginx.ingress.kubernetes.io/rewrite-target: / nodeSelector: - node-role.kubernetes.io/master: "" + node-role.kubernetes.io/control-plane: "" service: type: NodePort nodePort: 30200 From ddf65114b7f9ea59186a5465079f09cc5463c9bf Mon Sep 17 00:00:00 2001 From: Adam Tetelman Date: Mon, 10 Jul 2023 21:16:18 +0000 Subject: [PATCH 15/39] More comprehensive update of master role -> control-plane --- config.example/group_vars/k8s-cluster.yml | 2 +- docs/k8s-cluster/helm.md | 2 +- playbooks/k8s-cluster.yml | 2 +- .../k8s-internal-container-registry/templates/values.yaml | 2 +- scripts/k8s/deploy_dashboard_user.sh | 2 +- scripts/k8s/deploy_kubeflow.sh | 2 +- scripts/k8s/deploy_monitoring.sh | 2 +- scripts/k8s/deploy_rook.sh | 2 +- workloads/examples/k8s/ingress-loadbalancer.yml | 8 ++++---- workloads/examples/k8s/ingress-nodeport.yml | 8 ++++---- workloads/examples/k8s/services/ambassador-rbac.yml | 2 +- workloads/examples/k8s/services/ingress-controller.yml | 4 ++-- .../examples/k8s/services/logging/es-statefulset.yaml | 2 +- .../examples/k8s/services/logging/kibana-deployment.yaml | 2 +- workloads/examples/k8s/services/node-exporter.yml | 2 +- workloads/examples/k8s/services/prometheus-monitor.yml | 4 ++-- workloads/services/k8s/dgxie/values.yaml | 2 +- 17 files changed, 25 insertions(+), 25 deletions(-) diff --git a/config.example/group_vars/k8s-cluster.yml b/config.example/group_vars/k8s-cluster.yml index 82f3a2eea..65609a6ee 100644 --- a/config.example/group_vars/k8s-cluster.yml +++ b/config.example/group_vars/k8s-cluster.yml @@ -8,7 +8,7 @@ kube_kubeadm_apiserver_extra_args: kubectl_localhost: false kubeconfig_localhost: true helm_enabled: true -tiller_node_selectors: "node-role.kubernetes.io/master=''" +tiller_node_selectors: "node-role.kubernetes.io/control-plane=''" ## Container runtime ## docker for docker, crio for cri-o and containerd for containerd. 
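Since this patch swaps the nodeSelector label across many charts and scripts at once, a quick pre-flight check along these lines can confirm which role label the nodes actually carry before rolling it out (a sketch; it assumes kubectl already points at the target cluster and uses the same label-selector form as the scripts in this patch):

    # Does the cluster expose the new control-plane role label, or only the legacy one?
    if kubectl get nodes -l node-role.kubernetes.io/control-plane= --no-headers 2>/dev/null | grep -q .; then
        echo "nodes carry node-role.kubernetes.io/control-plane"
    elif kubectl get nodes -l node-role.kubernetes.io/master= --no-headers 2>/dev/null | grep -q .; then
        echo "nodes only carry the legacy node-role.kubernetes.io/master label"
    fi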
diff --git a/docs/k8s-cluster/helm.md b/docs/k8s-cluster/helm.md index 7d591d148..37b0e2dca 100644 --- a/docs/k8s-cluster/helm.md +++ b/docs/k8s-cluster/helm.md @@ -21,5 +21,5 @@ If the the value of `helm_enabled` was set to `false` in the `config/kube.yml` f ```bash kubectl create sa tiller --namespace kube-system kubectl create clusterrolebinding tiller --clusterrole cluster-admin --serviceaccount=kube-system:tiller -helm init --service-account tiller --node-selectors node-role.kubernetes.io/master=true +helm init --service-account tiller --node-selectors node-role.kubernetes.io/control-plane=true ``` diff --git a/playbooks/k8s-cluster.yml b/playbooks/k8s-cluster.yml index a6e65532a..8423913aa 100644 --- a/playbooks/k8s-cluster.yml +++ b/playbooks/k8s-cluster.yml @@ -267,7 +267,7 @@ command: "/usr/local/bin/helm repo add 'stable' 'https://charts.helm.sh/stable' --force-update" delegate_to: localhost - name: kubeadm | Remove taint for master with node role - command: "{{ artifacts_dir }}/kubectl --kubeconfig {{ artifacts_dir }}/admin.conf taint node {{ inventory_hostname }} node-role.kubernetes.io/master:NoSchedule-" + command: "{{ artifacts_dir }}/kubectl --kubeconfig {{ artifacts_dir }}/admin.conf taint node {{ inventory_hostname }} node-role.kubernetes.io/control-plane:NoSchedule-" delegate_to: localhost failed_when: false # Taint will not be present if kube-master also under kube-node diff --git a/roles/k8s-internal-container-registry/templates/values.yaml b/roles/k8s-internal-container-registry/templates/values.yaml index 90a8649e3..2981314cd 100644 --- a/roles/k8s-internal-container-registry/templates/values.yaml +++ b/roles/k8s-internal-container-registry/templates/values.yaml @@ -4,7 +4,7 @@ persistence: {% if container_registry_persistence_enabled %}size: "{{ container_registry_storage_size }}"{% endif %} nodeSelector: - node-role.kubernetes.io/master: "" + node-role.kubernetes.io/control-plane: "" service: type: "{{ container_registry_service_type }}" {% if container_registry_service_type == "NodePort" %}nodePort: "{{ container_registry_node_port }}"{% endif %} diff --git a/scripts/k8s/deploy_dashboard_user.sh b/scripts/k8s/deploy_dashboard_user.sh index 7c82685fe..0781eadde 100755 --- a/scripts/k8s/deploy_dashboard_user.sh +++ b/scripts/k8s/deploy_dashboard_user.sh @@ -15,7 +15,7 @@ fi # Get IP of first master dashboard_port=$(kubectl -n kube-system get svc kubernetes-dashboard --no-headers -o custom-columns=PORT:.spec.ports.*.nodePort) -master_ip=$(kubectl get nodes -l node-role.kubernetes.io/master= --no-headers -o custom-columns=IP:.status.addresses.*.address | cut -f1 -d, | head -1) +master_ip=$(kubectl get nodes -l node-role.kubernetes.io/control-plane= --no-headers -o custom-columns=IP:.status.addresses.*.address | cut -f1 -d, | head -1) # Get access token token=$(kubectl -n kube-system describe secret $(kubectl -n kube-system get secret | grep admin-user | awk '{print $1}') | grep ^token: | awk '{print $2}') diff --git a/scripts/k8s/deploy_kubeflow.sh b/scripts/k8s/deploy_kubeflow.sh index e561a4cda..e6591515d 100755 --- a/scripts/k8s/deploy_kubeflow.sh +++ b/scripts/k8s/deploy_kubeflow.sh @@ -221,7 +221,7 @@ function poll_url() { function get_url() { # Get LoadBalancer and NodePorts - master_ip=$(kubectl get nodes -l node-role.kubernetes.io/master= --no-headers -o custom-columns=IP:.status.addresses.*.address | cut -f1 -d, | head -1) + master_ip=$(kubectl get nodes -l node-role.kubernetes.io/control-plane= --no-headers -o custom-columns=IP:.status.addresses.*.address | 
cut -f1 -d, | head -1) nodePort="$(kubectl get svc -n istio-system istio-ingressgateway --no-headers -o custom-columns=PORT:.spec.ports[?\(@.name==\"http2\"\)].nodePort)" secure_nodePort="$(kubectl get svc -n istio-system istio-ingressgateway --no-headers -o custom-columns=PORT:.spec.ports[?\(@.name==\"https\"\)].nodePort)" lb_ip="$(kubectl get svc -n istio-system istio-ingressgateway --no-headers -o custom-columns=:.status.loadBalancer.ingress[0].ip)" diff --git a/scripts/k8s/deploy_monitoring.sh b/scripts/k8s/deploy_monitoring.sh index 3ee5505b1..d73fdc5f5 100755 --- a/scripts/k8s/deploy_monitoring.sh +++ b/scripts/k8s/deploy_monitoring.sh @@ -192,7 +192,7 @@ function setup_gpu_monitoring() { function get_ips(){ # Get IP information - master_ip=$(kubectl get nodes -l node-role.kubernetes.io/master= --no-headers -o custom-columns=IP:.status.addresses.*.address | cut -f1 -d, | head -1) + master_ip=$(kubectl get nodes -l node-role.kubernetes.io/control-plane= --no-headers -o custom-columns=IP:.status.addresses.*.address | cut -f1 -d, | head -1) ingress_ip_string="$(echo ${master_ip} | tr '.' '-').nip.io" } diff --git a/scripts/k8s/deploy_rook.sh b/scripts/k8s/deploy_rook.sh index 9a746863f..124932893 100755 --- a/scripts/k8s/deploy_rook.sh +++ b/scripts/k8s/deploy_rook.sh @@ -81,7 +81,7 @@ function print_rook() { export rook_toolspod=$(kubectl -n rook-ceph get pod -l app=rook-ceph-tools --no-headers -o custom-columns=:.metadata.name) # Get IP of first master - master_ip=$(kubectl get nodes -l node-role.kubernetes.io/master= --no-headers -o custom-columns=IP:.status.addresses.*.address | cut -f1 -d, | head -1) + master_ip=$(kubectl get nodes -l node-role.kubernetes.io/control-plane= --no-headers -o custom-columns=IP:.status.addresses.*.address | cut -f1 -d, | head -1) # Get Ceph dashboard port dash_port=$(kubectl -n rook-ceph get svc rook-ceph-mgr-dashboard-external-https --no-headers -o custom-columns=PORT:.spec.ports.*.nodePort) diff --git a/workloads/examples/k8s/ingress-loadbalancer.yml b/workloads/examples/k8s/ingress-loadbalancer.yml index 7dd47183b..eead49c3d 100644 --- a/workloads/examples/k8s/ingress-loadbalancer.yml +++ b/workloads/examples/k8s/ingress-loadbalancer.yml @@ -12,12 +12,12 @@ controller: # NodePort can be used instead where we don't have a load balancer. service: type: LoadBalancer - # Always run on master nodes + # Always run on control-plane nodes nodeSelector: - node-role.kubernetes.io/master: "" + node-role.kubernetes.io/control-plane: "" # Ingress back-end defaultBackend: - # Always run on master nodes + # Always run on control-plane nodes nodeSelector: - node-role.kubernetes.io/master: "" + node-role.kubernetes.io/control-plane: "" diff --git a/workloads/examples/k8s/ingress-nodeport.yml b/workloads/examples/k8s/ingress-nodeport.yml index 2cd6372fb..d0e0d5b82 100644 --- a/workloads/examples/k8s/ingress-nodeport.yml +++ b/workloads/examples/k8s/ingress-nodeport.yml @@ -12,12 +12,12 @@ controller: # NodePort can be used instead where we don't have a load balancer. 
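# (Sketch, not part of the chart values: with type NodePort the controller is
# reachable on a high port on every node IP. Assuming the DeepOps default
# namespace name deepops-ingress, something like the following locates it:
#   kubectl -n deepops-ingress get svc -o wide | grep NodePort)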
service: type: NodePort - # Always run on master nodes + # Always run on control-plane nodes nodeSelector: - node-role.kubernetes.io/master: "" + node-role.kubernetes.io/control-plane: "" # Ingress back-end defaultBackend: - # Always run on master nodes + # Always run on control-plane nodes nodeSelector: - node-role.kubernetes.io/master: "" + node-role.kubernetes.io/control-plane: "" diff --git a/workloads/examples/k8s/services/ambassador-rbac.yml b/workloads/examples/k8s/services/ambassador-rbac.yml index 76a8d1101..760eb45e8 100644 --- a/workloads/examples/k8s/services/ambassador-rbac.yml +++ b/workloads/examples/k8s/services/ambassador-rbac.yml @@ -63,7 +63,7 @@ spec: spec: serviceAccountName: ambassador nodeSelector: - node-role.kubernetes.io/master: "true" + node-role.kubernetes.io/control-plane: "true" containers: - name: ambassador image: quay.io/datawire/ambassador:0.28.1 diff --git a/workloads/examples/k8s/services/ingress-controller.yml b/workloads/examples/k8s/services/ingress-controller.yml index 7209d114d..4bbc68e22 100644 --- a/workloads/examples/k8s/services/ingress-controller.yml +++ b/workloads/examples/k8s/services/ingress-controller.yml @@ -55,7 +55,7 @@ spec: spec: serviceAccountName: ingress nodeSelector: - node-role.kubernetes.io/master: "true" + node-role.kubernetes.io/control-plane: "true" containers: - name: nginx-ingress-controller image: quay.io/kubernetes-ingress-controller/nginx-ingress-controller:0.11.0 @@ -126,7 +126,7 @@ spec: spec: terminationGracePeriodSeconds: 60 nodeSelector: - node-role.kubernetes.io/master: "true" + node-role.kubernetes.io/control-plane: "true" containers: - name: default-http-backend image: gcr.io/google_containers/defaultbackend:1.0 diff --git a/workloads/examples/k8s/services/logging/es-statefulset.yaml b/workloads/examples/k8s/services/logging/es-statefulset.yaml index dc95db32b..61ac10379 100644 --- a/workloads/examples/k8s/services/logging/es-statefulset.yaml +++ b/workloads/examples/k8s/services/logging/es-statefulset.yaml @@ -72,7 +72,7 @@ spec: kubernetes.io/cluster-service: "true" spec: nodeSelector: - node-role.kubernetes.io/master: "true" + node-role.kubernetes.io/control-plane: "true" serviceAccountName: elasticsearch-logging containers: - image: k8s.gcr.io/elasticsearch:v5.6.4 diff --git a/workloads/examples/k8s/services/logging/kibana-deployment.yaml b/workloads/examples/k8s/services/logging/kibana-deployment.yaml index 7ec8912ee..e4f98b502 100644 --- a/workloads/examples/k8s/services/logging/kibana-deployment.yaml +++ b/workloads/examples/k8s/services/logging/kibana-deployment.yaml @@ -18,7 +18,7 @@ spec: k8s-app: kibana-logging spec: nodeSelector: - node-role.kubernetes.io/master: "true" + node-role.kubernetes.io/control-plane: "true" containers: - name: kibana-logging image: docker.elastic.co/kibana/kibana:5.6.4 diff --git a/workloads/examples/k8s/services/node-exporter.yml b/workloads/examples/k8s/services/node-exporter.yml index 86bbfa3b1..e3d8c34d3 100644 --- a/workloads/examples/k8s/services/node-exporter.yml +++ b/workloads/examples/k8s/services/node-exporter.yml @@ -29,7 +29,7 @@ spec: name: node-exporter spec: nodeSelector: - node-role.kubernetes.io/master: "true" + node-role.kubernetes.io/control-plane: "true" containers: - image: prom/node-exporter name: node-exporter diff --git a/workloads/examples/k8s/services/prometheus-monitor.yml b/workloads/examples/k8s/services/prometheus-monitor.yml index 803896c06..4ea3a25b8 100644 --- a/workloads/examples/k8s/services/prometheus-monitor.yml +++ 
b/workloads/examples/k8s/services/prometheus-monitor.yml @@ -58,7 +58,7 @@ spec: securityContext: fsGroup: 2000 # 2000 works but container runs as 'nobody/nobody', id/gid: 65534 nodeSelector: - node-role.kubernetes.io/master: "true" + node-role.kubernetes.io/control-plane: "true" containers: - image: quay.io/prometheus/prometheus:v2.1.0 name: prometheus @@ -183,7 +183,7 @@ data: tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt # If your node certificates are self-signed or use a different CA to the - # master CA, then disable certificate verification below. Note that + # control-plane CA, then disable certificate verification below. Note that # certificate verification is an integral part of a secure infrastructure # so this should only be disabled in a controlled environment. You can # disable certificate verification by uncommenting the line below. diff --git a/workloads/services/k8s/dgxie/values.yaml b/workloads/services/k8s/dgxie/values.yaml index 2968d7a3a..bacfcb832 100644 --- a/workloads/services/k8s/dgxie/values.yaml +++ b/workloads/services/k8s/dgxie/values.yaml @@ -49,7 +49,7 @@ resources: {} # memory: 128Mi nodeSelector: - node-role.kubernetes.io/master: "" + node-role.kubernetes.io/control-plane: "" tolerations: [] From 93b027307e7565772c3a37832c779ee4e7e9e0ab Mon Sep 17 00:00:00 2001 From: Adam Tetelman Date: Mon, 10 Jul 2023 21:23:03 +0000 Subject: [PATCH 16/39] Fix multinode Jenkinsfile --- workloads/jenkins/Jenkinsfile-multi-nightly | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workloads/jenkins/Jenkinsfile-multi-nightly b/workloads/jenkins/Jenkinsfile-multi-nightly index e1e465b11..eeb13d9e0 100644 --- a/workloads/jenkins/Jenkinsfile-multi-nightly +++ b/workloads/jenkins/Jenkinsfile-multi-nightly @@ -20,7 +20,7 @@ pipeline { steps { // The only difference between the nightly and multi-nightly Jenkinsfiles should be changing GPU quantity from 1 to 2 // TODO: ideally lock should work with declared stages - lock(resource: null, label: 'gpu', quantity: 1, variable: 'GPUDATA') { + lock(resource: null, label: 'gpu', quantity: 2, variable: 'GPUDATA') { echo "Reset repo and unmunge files" sh ''' git reset --hard From 40d35ae4e8e3abc43e3546c905928d74b6119017 Mon Sep 17 00:00:00 2001 From: Adam Tetelman Date: Mon, 10 Jul 2023 14:38:23 -0700 Subject: [PATCH 17/39] Version bumps for gpu operator (23.3.2), GFD (0.8.0), and device plugin (0.14.0) --- roles/nvidia-gpu-operator/defaults/main.yml | 4 ++-- roles/nvidia-k8s-gpu-device-plugin/defaults/main.yml | 2 +- roles/nvidia-k8s-gpu-feature-discovery/defaults/main.yml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/roles/nvidia-gpu-operator/defaults/main.yml b/roles/nvidia-gpu-operator/defaults/main.yml index 5322044a5..0cd7dad47 100644 --- a/roles/nvidia-gpu-operator/defaults/main.yml +++ b/roles/nvidia-gpu-operator/defaults/main.yml @@ -12,7 +12,7 @@ gpu_operator_nvaie_helm_repo: "https://helm.ngc.nvidia.com/nvaie" gpu_operator_nvaie_chart_name: "nvaie/gpu-operator" # NVAIE GPU Operator may require different version, check NGC enterprise collection. 
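# (Sketch, not part of this defaults file: before applying the chart bump just
# below, a pre-flight check along these lines confirms the target version is
# actually published; the repo name "nvidia" is an assumption for however the
# NVIDIA helm repo was added locally:
#   helm repo update
#   helm search repo nvidia/gpu-operator --versions | grep v23.3.2)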
-gpu_operator_chart_version: "v22.9.2" +gpu_operator_chart_version: "v23.3.2" k8s_gpu_mig_strategy: "mixed" @@ -33,7 +33,7 @@ gpu_operator_grid_config_dir: "{{ deepops_dir }}/gpu_operator" # Defaults from https://github.com/NVIDIA/gpu-operator/blob/master/deployments/gpu-operator/values.yaml gpu_operator_default_runtime: "containerd" gpu_operator_driver_registry: "nvcr.io/nvidia" -gpu_operator_driver_version: "525.85.12" +gpu_operator_driver_version: "525.105.17" # This enables/disables NVAIE gpu_operator_nvaie_enable: false diff --git a/roles/nvidia-k8s-gpu-device-plugin/defaults/main.yml b/roles/nvidia-k8s-gpu-device-plugin/defaults/main.yml index 7895822f3..a1214ec17 100644 --- a/roles/nvidia-k8s-gpu-device-plugin/defaults/main.yml +++ b/roles/nvidia-k8s-gpu-device-plugin/defaults/main.yml @@ -2,6 +2,6 @@ k8s_gpu_plugin_helm_repo: "https://nvidia.github.io/k8s-device-plugin" k8s_gpu_plugin_chart_name: "nvdp/nvidia-device-plugin" k8s_gpu_plugin_release_name: "nvidia-device-plugin" -k8s_gpu_plugin_chart_version: "0.13.0" +k8s_gpu_plugin_chart_version: "0.14.0" k8s_gpu_plugin_init_error: "false" k8s_gpu_mig_strategy: "mixed" diff --git a/roles/nvidia-k8s-gpu-feature-discovery/defaults/main.yml b/roles/nvidia-k8s-gpu-feature-discovery/defaults/main.yml index 0a84d9051..d1cced31e 100644 --- a/roles/nvidia-k8s-gpu-feature-discovery/defaults/main.yml +++ b/roles/nvidia-k8s-gpu-feature-discovery/defaults/main.yml @@ -2,5 +2,5 @@ k8s_gpu_feature_discovery_helm_repo: "https://nvidia.github.io/gpu-feature-discovery" k8s_gpu_feature_discovery_chart_name: "nvgfd/gpu-feature-discovery" k8s_gpu_feature_discovery_release_name: "gpu-feature-discovery" -k8s_gpu_feature_discovery_chart_version: "0.7.0" +k8s_gpu_feature_discovery_chart_version: "0.8.0" k8s_gpu_mig_strategy: "mixed" From c71488e30f87439ff31cd4865832e911d7fd1191 Mon Sep 17 00:00:00 2001 From: Adam Tetelman Date: Mon, 10 Jul 2023 21:47:37 +0000 Subject: [PATCH 18/39] Bump Kubespray to v2.22.1 --- submodules/kubespray | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/kubespray b/submodules/kubespray index 08467ad6b..2cf23e310 160000 --- a/submodules/kubespray +++ b/submodules/kubespray @@ -1 +1 @@ -Subproject commit 08467ad6b3bdd5b15c33e3a63d476630766bd04a +Subproject commit 2cf23e3104f9b8b20ca1aefd36e3e89be26fd090 From 495b7a6cef66fc21992dd5c8b4f84e142d7f12c1 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 11 Jul 2023 04:55:06 +0000 Subject: [PATCH 19/39] Bump Kubeflow (1.7.0) and kustomize (5.1.0) --- scripts/k8s/deploy_kubeflow.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/k8s/deploy_kubeflow.sh b/scripts/k8s/deploy_kubeflow.sh index e6591515d..955232b5d 100755 --- a/scripts/k8s/deploy_kubeflow.sh +++ b/scripts/k8s/deploy_kubeflow.sh @@ -18,7 +18,7 @@ export KUBEFLOW_DEPLOY_TIMEOUT="${KUBEFLOW_DEPLOY_TIMEOUT:-1200}" # Define Kubeflow manifests location export KUBEFLOW_MANIFESTS_DEST="${KUBEFLOW_MANIFESTS_DEST:-${CONFIG_DIR}/kubeflow-install/manifests}" export KUBEFLOW_MANIFESTS_URL="${KUBEFLOW_MANIFESTS_URL:-https://github.com/kubeflow/manifests}" -export KUBEFLOW_MANIFESTS_VERSION="${KUBEFLOW_MANIFESTS_VERSION:-v1.6.1}" +export KUBEFLOW_MANIFESTS_VERSION="${KUBEFLOW_MANIFESTS_VERSION:-v1.7.0}" # Define configuration we're injecting into the manifests location export KUBEFLOW_DEEPOPS_CONFIG_DIR="${KUBEFLOW_DEEPOPS_CONFIG_DIR:-${CONFIG_DIR}/files/kubeflow}" @@ -26,7 +26,7 @@ export KUBEFLOW_DEEPOPS_DEX_CONFIG="${KUBEFLOW_DEEPOPS_DEX_CONFIG:-${KUBEFLOW_DE export 
KUBEFLOW_DEEPOPS_USERNS_PARAMS="${KUBEFLOW_DEEPOPS_USERNS_PARAMS:-${KUBEFLOW_DEEPOPS_CONFIG_DIR}/user-namespace-params.env}" # Define Kustomize location -export KUSTOMIZE_URL="${KUSTOMIZE_URL:-https://github.com/kubernetes-sigs/kustomize/releases/download/v3.2.0/kustomize_3.2.0_linux_amd64}" +export KUSTOMIZE_URL="${KUSTOMIZE_URL:-https://github.com/kubernetes-sigs/kustomize/releases/download/kustomize%2Fv5.1.0/kustomize_v5.1.0_linux_amd64.tar.gz}" export KUSTOMIZE="${KUSTOMIZE:-${CONFIG_DIR}/kustomize}" function help_me() { @@ -147,7 +147,8 @@ function stand_up() { pushd . pushd "${KUBEFLOW_MANIFESTS_DEST}" - wget -O "${KUSTOMIZE}" "${KUSTOMIZE_URL}" + wget -O "${KUSTOMIZE}.tgz" "${KUSTOMIZE_URL}" + tar -xvf "${KUSTOMIZE}.tgz" -C "${CONFIG_DIR}" chmod +x "${KUSTOMIZE}" echo "Beginning Kubeflow deployment" From a25fdb2748dff406902f89828ecfe1ce0818fcca Mon Sep 17 00:00:00 2001 From: Adam Tetelman Date: Tue, 11 Jul 2023 05:09:34 +0000 Subject: [PATCH 20/39] Workaround bug to add kubeflow support for K8s v1.26 --- scripts/k8s/deploy_kubeflow.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/k8s/deploy_kubeflow.sh b/scripts/k8s/deploy_kubeflow.sh index 955232b5d..84f26050a 100755 --- a/scripts/k8s/deploy_kubeflow.sh +++ b/scripts/k8s/deploy_kubeflow.sh @@ -136,6 +136,10 @@ function clone_repo() { cp -v "${KUBEFLOW_DEEPOPS_DEX_CONFIG}" "${KUBEFLOW_MANIFESTS_DEST}/common/dex/base/config-map.yaml" cp -v "${KUBEFLOW_DEEPOPS_USERNS_PARAMS}" "${KUBEFLOW_MANIFESTS_DEST}/common/user-namespace/base/params.env" + + # BUG: https://stackoverflow.com/questions/76502195/horizontalpodautoscaler-not-found-on-minikube-when-installing-kubeflow + sed -i 's:autoscaling/v2beta2:autoscaling/v2:' "${KUBEFLOW_MANIFESTS_DEST}/common/knative/knative-serving/base/upstream/serving-core.yaml" + popd echo "Kubeflow manifests repo:" echo "- Cloned from: ${KUBEFLOW_MANIFESTS_URL}" From 7729f46892b62b6b832b87b986f3955c67a78efc Mon Sep 17 00:00:00 2001 From: Adam Tetelman Date: Tue, 11 Jul 2023 06:34:40 +0000 Subject: [PATCH 21/39] Update networking config for kubeflow v1.7 --- config.example/files/kubeflow/dex-config-map.yaml | 2 +- scripts/k8s/deploy_kubeflow.sh | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/config.example/files/kubeflow/dex-config-map.yaml b/config.example/files/kubeflow/dex-config-map.yaml index 6d1cccfd7..978493373 100644 --- a/config.example/files/kubeflow/dex-config-map.yaml +++ b/config.example/files/kubeflow/dex-config-map.yaml @@ -37,6 +37,6 @@ data: staticClients: # https://github.com/dexidp/dex/pull/1664 - idEnv: OIDC_CLIENT_ID - redirectURIs: ["/login/oidc"] + redirectURIs: ["/login/oidc", "/authservice/oidc/callback"] name: 'Dex Login Application' secretEnv: OIDC_CLIENT_SECRET diff --git a/scripts/k8s/deploy_kubeflow.sh b/scripts/k8s/deploy_kubeflow.sh index 84f26050a..8bcc27698 100755 --- a/scripts/k8s/deploy_kubeflow.sh +++ b/scripts/k8s/deploy_kubeflow.sh @@ -136,10 +136,12 @@ function clone_repo() { cp -v "${KUBEFLOW_DEEPOPS_DEX_CONFIG}" "${KUBEFLOW_MANIFESTS_DEST}/common/dex/base/config-map.yaml" cp -v "${KUBEFLOW_DEEPOPS_USERNS_PARAMS}" "${KUBEFLOW_MANIFESTS_DEST}/common/user-namespace/base/params.env" - # BUG: https://stackoverflow.com/questions/76502195/horizontalpodautoscaler-not-found-on-minikube-when-installing-kubeflow sed -i 's:autoscaling/v2beta2:autoscaling/v2:' "${KUBEFLOW_MANIFESTS_DEST}/common/knative/knative-serving/base/upstream/serving-core.yaml" + # XXX: Change the default Istio Ingress Gateway configuration to support NodePort for 
ease-of-use in on-prem + sed -i 's:ClusterIP:NodePort:g' "${KUBEFLOW_MANIFESTS_DEST}/common/istio-1-16/istio-install/base/patches/service.yaml" + popd echo "Kubeflow manifests repo:" echo "- Cloned from: ${KUBEFLOW_MANIFESTS_URL}" From c06e9ee279962589a2c8c2342452af03236421d3 Mon Sep 17 00:00:00 2001 From: Adam Tetelman Date: Tue, 11 Jul 2023 07:56:37 +0000 Subject: [PATCH 22/39] Disable secure cookies in Kubeflow --- scripts/k8s/deploy_kubeflow.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/k8s/deploy_kubeflow.sh b/scripts/k8s/deploy_kubeflow.sh index 8bcc27698..69eea2187 100755 --- a/scripts/k8s/deploy_kubeflow.sh +++ b/scripts/k8s/deploy_kubeflow.sh @@ -142,6 +142,13 @@ function clone_repo() { # XXX: Change the default Istio Ingress Gateway configuration to support NodePort for ease-of-use in on-prem sed -i 's:ClusterIP:NodePort:g' "${KUBEFLOW_MANIFESTS_DEST}/common/istio-1-16/istio-install/base/patches/service.yaml" + # XXX: Make the Kubeflow cluster allow insecure http instead of https + # Remove this for any production cluster and enable HTTPS suitable for the environment + # XXX: https://github.com/kubeflow/manifests#connect-to-your-kubeflow-cluster + sed -i 's:JWA_APP_SECURE_COOKIES=true:JWA_APP_SECURE_COOKIES=false:' "${KUBEFLOW_MANIFESTS_DEST}/apps/jupyter/jupyter-web-app/upstream/base/params.env" + sed -i 's:VWA_APP_SECURE_COOKIES=true:VWA_APP_SECURE_COOKIES=false:' "${KUBEFLOW_MANIFESTS_DEST}/apps/volumes-web-app/upstream/base/params.env" + sed -i 's:TWA_APP_SECURE_COOKIES=true:TWA_APP_SECURE_COOKIES=false:' "${KUBEFLOW_MANIFESTS_DEST}/apps/tensorboard/tensorboards-web-app/upstream/base/params.env" + popd echo "Kubeflow manifests repo:" echo "- Cloned from: ${KUBEFLOW_MANIFESTS_URL}" From 4c8db3017fe880382ce73d00c6087574bc12af53 Mon Sep 17 00:00:00 2001 From: Adam Tetelman Date: Tue, 18 Jul 2023 13:13:05 -0700 Subject: [PATCH 23/39] BUG:1284 - K8s Dashboard update --- workloads/services/k8s/k8s-dashboard-admin.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/workloads/services/k8s/k8s-dashboard-admin.yml b/workloads/services/k8s/k8s-dashboard-admin.yml index fd5a48370..c972495cf 100644 --- a/workloads/services/k8s/k8s-dashboard-admin.yml +++ b/workloads/services/k8s/k8s-dashboard-admin.yml @@ -4,6 +4,16 @@ kind: ServiceAccount metadata: name: admin-user namespace: kube-system +--- +apiVersion: v1 +kind: Secret +metadata: + name: admin-user-secret + annotations: + kubernetes.io/service-account.name: admin-user + namespace: kube-system +type: kubernetes.io/service-account-token + --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding From f339cf1bff08bdb6de891e21ac33043d83ca0cd2 Mon Sep 17 00:00:00 2001 From: Douglas Holt Date: Tue, 25 Jul 2023 19:05:42 +0000 Subject: [PATCH 24/39] update roles to latest versions --- roles/requirements.yml | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/roles/requirements.yml b/roles/requirements.yml index 0948751a7..d4239423f 100644 --- a/roles/requirements.yml +++ b/roles/requirements.yml @@ -2,18 +2,18 @@ collections: - name: ansible.posix - version: 1.4.0 + version: 1.5.4 - name: community.general - version: 4.8.5 + version: 7.2.0 - name: community.docker - version: 2.7.0 + version: 3.4.8 roles: - src: dev-sec.ssh-hardening - version: "6.1.3" + version: "9.7.0" - src: https://github.com/DeepOps/ansible-role-users name: DeepOps.users @@ -24,10 +24,10 @@ roles: name: DeepOps.hosts - src: geerlingguy.ntp - version: "2.3.0" + version: "2.3.2" - 
src: andrewrothstein.miniconda - version: "v6.1.5" + version: "v6.1.9" - src: nvidia.nvidia_driver version: "v2.3.0" @@ -39,22 +39,22 @@ roles: version: "v0.5.0" - src: geerlingguy.filebeat - version: "3.4.0" + version: "3.5.0" - src: robertdebock.java - version: "4.1.4" + version: "4.2.0" - src: robertdebock.elastic_repo - version: "1.0.4" + version: "1.1.0" - src: robertdebock.logstash - version: "1.1.2" + version: "1.1.3" - src: robertdebock.elasticsearch - version: "1.1.5" + version: "1.1.6" - src: robertdebock.kibana - version: "1.2.5" + version: "1.2.6" - src: https://github.com/DeepOps/ansible-maas.git name: ansible-maas @@ -65,10 +65,10 @@ roles: version: 'c9022153036dfdde4e2b313aecde4a46cd6f6687' - src: https://github.com/OSC/ood-ansible.git - version: 'v2.0.6' + version: 'v3.0.3' - src: abims_sbr.singularity version: 3.7.1-4 - src: gantsign.golang - version: 2.4.0 + version: 3.1.6 From 55c302f0e17199b31d4718997a9ab1cde393ee90 Mon Sep 17 00:00:00 2001 From: Jaeho Lee Date: Thu, 27 Jul 2023 04:08:30 +0900 Subject: [PATCH 25/39] update nvidia_driver_ubuntu_cuda_keyring_package to latest version --- roles/nvidia_cuda/defaults/main.yml | 2 +- roles/nvidia_dcgm/defaults/main.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/roles/nvidia_cuda/defaults/main.yml b/roles/nvidia_cuda/defaults/main.yml index fb8a2caa1..c004528bc 100644 --- a/roles/nvidia_cuda/defaults/main.yml +++ b/roles/nvidia_cuda/defaults/main.yml @@ -26,5 +26,5 @@ nvidia_driver_rhel_cuda_repo_gpgkey: "https://developer.download.nvidia.com/comp # Ubuntu old_nvidia_driver_ubuntu_cuda_repo_gpgkey_id: "7fa2af80" nvidia_driver_ubuntu_cuda_repo_baseurl: "https://developer.download.nvidia.com/compute/cuda/repos/{{ _ubuntu_repo_dir }}" -nvidia_driver_ubuntu_cuda_keyring_package: "cuda-keyring_1.0-1_all.deb" +nvidia_driver_ubuntu_cuda_keyring_package: "cuda-keyring_1.1-1_all.deb" nvidia_driver_ubuntu_cuda_keyring_url: "{{ nvidia_driver_ubuntu_cuda_repo_baseurl }}/{{ nvidia_driver_ubuntu_cuda_keyring_package }}" diff --git a/roles/nvidia_dcgm/defaults/main.yml b/roles/nvidia_dcgm/defaults/main.yml index d81155bf0..087cf9c0c 100644 --- a/roles/nvidia_dcgm/defaults/main.yml +++ b/roles/nvidia_dcgm/defaults/main.yml @@ -10,5 +10,5 @@ nvidia_driver_rhel_cuda_repo_gpgkey: "https://developer.download.nvidia.com/comp # Ubuntu old_nvidia_driver_ubuntu_cuda_repo_gpgkey_id: "7fa2af80" nvidia_driver_ubuntu_cuda_repo_baseurl: "https://developer.download.nvidia.com/compute/cuda/repos/{{ _ubuntu_repo_dir }}" -nvidia_driver_ubuntu_cuda_keyring_package: "cuda-keyring_1.0-1_all.deb" +nvidia_driver_ubuntu_cuda_keyring_package: "cuda-keyring_1.1-1_all.deb" nvidia_driver_ubuntu_cuda_keyring_url: "{{ nvidia_driver_ubuntu_cuda_repo_baseurl }}/{{ nvidia_driver_ubuntu_cuda_keyring_package }}" From 2b95117b909bbb7ff11e7403550ac18b21a41132 Mon Sep 17 00:00:00 2001 From: Douglas Holt Date: Thu, 27 Jul 2023 08:21:40 -0700 Subject: [PATCH 26/39] Fix for docker install playbook due to kubespray changes --- playbooks/container/docker.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/playbooks/container/docker.yml b/playbooks/container/docker.yml index b2369de8d..693b87fb0 100644 --- a/playbooks/container/docker.yml +++ b/playbooks/container/docker.yml @@ -6,11 +6,12 @@ - docker vars_files: # include kubespray-defaults here so that we can set the facts using the - # kubespray 0040-set_facts.yml tasks + # kubespray 0020-set_facts.yml tasks - ../../submodules/kubespray/roles/kubespray-defaults/defaults/main.yaml + - 
../../submodules/kubespray/roles/kubernetes/preinstall/defaults/main.yml tasks: - name: include kubespray task to set facts required for docker role - include: ../../submodules/kubespray/roles/kubernetes/preinstall/tasks/0040-set_facts.yml + include: ../../submodules/kubespray/roles/kubernetes/preinstall/tasks/0020-set_facts.yml when: docker_install | default('yes') - name: remove docker overrides, specifically to deal with conflicting options from DGX OS file: From b25e195dfd367f206987477ebc8cde4d3bd01bb9 Mon Sep 17 00:00:00 2001 From: Douglas Holt Date: Thu, 27 Jul 2023 14:19:27 -0700 Subject: [PATCH 27/39] add ubuntu 22.04 support --- roles/nhc/defaults/main.yml | 2 +- roles/nhc/molecule/default/molecule.yml | 6 ++++++ roles/nhc/vars/ubuntu-22.04.yml | 5 +++++ 3 files changed, 12 insertions(+), 1 deletion(-) create mode 100644 roles/nhc/vars/ubuntu-22.04.yml diff --git a/roles/nhc/defaults/main.yml b/roles/nhc/defaults/main.yml index 90f3379ba..43753db99 100644 --- a/roles/nhc/defaults/main.yml +++ b/roles/nhc/defaults/main.yml @@ -1,5 +1,5 @@ --- -nhc_version: "1.4.2" +nhc_version: "1.4.3" nhc_src_url: "https://github.com/mej/nhc/releases/download/{{ nhc_version }}/lbnl-nhc-{{ nhc_version }}.tar.xz" nhc_install_dir: "/usr" nhc_config_dir: "/etc" diff --git a/roles/nhc/molecule/default/molecule.yml b/roles/nhc/molecule/default/molecule.yml index 839358bfa..5d20b09ff 100644 --- a/roles/nhc/molecule/default/molecule.yml +++ b/roles/nhc/molecule/default/molecule.yml @@ -16,6 +16,12 @@ platforms: - /sys/fs/cgroup:/sys/fs/cgroup:ro privileged: true pre_build_image: true + - name: nhc-ubuntu-2204 + image: geerlingguy/docker-ubuntu2204-ansible + volumes: + - /sys/fs/cgroup:/sys/fs/cgroup:ro + privileged: true + pre_build_image: true - name: nhc-centos-7 image: geerlingguy/docker-centos7-ansible volumes: diff --git a/roles/nhc/vars/ubuntu-22.04.yml b/roles/nhc/vars/ubuntu-22.04.yml new file mode 100644 index 000000000..fb17f752d --- /dev/null +++ b/roles/nhc/vars/ubuntu-22.04.yml @@ -0,0 +1,5 @@ +--- +nhc_build_deps: + - build-essential + +nhc_ssh_daemon: "sshd:" From 05f52a325a14ffc9f10cbc2e88ea590a12381382 Mon Sep 17 00:00:00 2001 From: Douglas Holt Date: Thu, 27 Jul 2023 14:19:51 -0700 Subject: [PATCH 28/39] update slurm version --- config.example/group_vars/all.yml | 4 +++- config.example/group_vars/slurm-cluster.yml | 2 +- roles/slurm/defaults/main.yml | 2 +- roles/slurm/vars/ubuntu.yml | 3 ++- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/config.example/group_vars/all.yml b/config.example/group_vars/all.yml index d9fcaf00a..a06cf6ae4 100644 --- a/config.example/group_vars/all.yml +++ b/config.example/group_vars/all.yml @@ -122,7 +122,9 @@ sftp_chroot: false ################################################################################ # NVIDIA GPU configuration # Playbook: nvidia-cuda -cuda_version: cuda-toolkit-11-5 +# Install latest version by default, +# if you want a specific version, use i.e. 
cuda-toolkit=12.2.0-1 +cuda_version: cuda-toolkit # DGX-specific vars may be used to target specific models, # because available versions for DGX may differ from the generic repo diff --git a/config.example/group_vars/slurm-cluster.yml b/config.example/group_vars/slurm-cluster.yml index 2d0417e20..4b68a0248 100644 --- a/config.example/group_vars/slurm-cluster.yml +++ b/config.example/group_vars/slurm-cluster.yml @@ -3,7 +3,7 @@ ################################################################################ # Slurm job scheduler configuration # Playbook: slurm, slurm-cluster, slurm-perf, slurm-perf-cluster, slurm-validation -slurm_version: "22.05.2" +slurm_version: "23.02.4" slurm_install_prefix: /usr/local pmix_install_prefix: /opt/deepops/pmix hwloc_install_prefix: /opt/deepops/hwloc diff --git a/roles/slurm/defaults/main.yml b/roles/slurm/defaults/main.yml index fcb381107..4573301db 100644 --- a/roles/slurm/defaults/main.yml +++ b/roles/slurm/defaults/main.yml @@ -7,7 +7,7 @@ hwloc_build_dir: /opt/deepops/build/hwloc pmix_build_dir: /opt/deepops/build/pmix slurm_workflow_build: yes -slurm_version: "22.05.2" +slurm_version: "23.02.4" slurm_src_url: "https://download.schedmd.com/slurm/slurm-{{ slurm_version }}.tar.bz2" slurm_build_make_clean: no slurm_build_dir_cleanup: no diff --git a/roles/slurm/vars/ubuntu.yml b/roles/slurm/vars/ubuntu.yml index 347b6144f..a6fa90769 100644 --- a/roles/slurm/vars/ubuntu.yml +++ b/roles/slurm/vars/ubuntu.yml @@ -5,8 +5,9 @@ slurm_build_deps: - build-essential - libmunge-dev - libmariadb-dev - - libmariadbclient-dev + - libmariadbclient-dev-compat - libpam0g-dev + - libdbus-1-dev - python3-minimal - ruby-dev - wget From 98a2444d293da0a6216fb83a28962808749d115e Mon Sep 17 00:00:00 2001 From: Douglas Holt Date: Thu, 27 Jul 2023 14:57:17 -0700 Subject: [PATCH 29/39] update HPC SDK versions --- config.example/group_vars/slurm-cluster.yml | 6 +++--- roles/nvidia_hpc_sdk/defaults/main.yml | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/config.example/group_vars/slurm-cluster.yml b/config.example/group_vars/slurm-cluster.yml index 4b68a0248..505d60ca4 100644 --- a/config.example/group_vars/slurm-cluster.yml +++ b/config.example/group_vars/slurm-cluster.yml @@ -137,9 +137,9 @@ sm_install_host: "slurm-master[0]" slurm_install_hpcsdk: true # Select the version of HPC SDK to download -hpcsdk_major_version: "22" -hpcsdk_minor_version: "1" -hpcsdk_file_cuda: "11.5" +hpcsdk_major_version: "23" +hpcsdk_minor_version: "5" +hpcsdk_file_cuda: "12.1" hpcsdk_arch: "x86_64" # In a Slurm cluster, default to setting up HPC SDK as modules rather than in diff --git a/roles/nvidia_hpc_sdk/defaults/main.yml b/roles/nvidia_hpc_sdk/defaults/main.yml index b54736936..ba99f3575 100644 --- a/roles/nvidia_hpc_sdk/defaults/main.yml +++ b/roles/nvidia_hpc_sdk/defaults/main.yml @@ -15,15 +15,15 @@ # See https://developer.nvidia.com/nvidia-hpc-sdk-downloads for more detail on available downloads. # Version strings used to construct download URL -hpcsdk_major_version: "22" -hpcsdk_minor_version: "1" -hpcsdk_file_cuda: "11.5" +hpcsdk_major_version: "23" +hpcsdk_minor_version: "5" +hpcsdk_file_cuda: "12.1" hpcsdk_arch: "x86_64" # We need to specify the default CUDA toolkit to use during installation. # This should usually be the latest CUDA included in the HPC SDK you are # installing. -hpcsdk_default_cuda: "11.5" +hpcsdk_default_cuda: "12.1" # Add HPC SDK modules to the MODULEPATH? 
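# (Sketch, not part of this defaults file: once modules are enabled, a smoke
# test of the install might be the following; the module name "nvhpc" is the
# usual HPC SDK convention, but verify it on your system:
#   module avail nvhpc
#   module load nvhpc/23.5 && nvc --version)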
hpcsdk_install_as_modules: false From d109602e348c199c666fb66a3855020e260c3653 Mon Sep 17 00:00:00 2001 From: Douglas Holt Date: Thu, 27 Jul 2023 15:05:23 -0700 Subject: [PATCH 30/39] remove duplicate variable for openmpi version --- config.example/group_vars/all.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/config.example/group_vars/all.yml b/config.example/group_vars/all.yml index a06cf6ae4..e86376413 100644 --- a/config.example/group_vars/all.yml +++ b/config.example/group_vars/all.yml @@ -277,10 +277,6 @@ deepops_dir: /opt/deepops # Roles: K8s GPU operator, GPU plugin, OpenShift/K8s deepops_venv: '{{ deepops_dir }}/venv' -# OpenMPI -# Playbook: openmpi -openmpi_version: 4.0.3 - # Disable cloud-init deepops_disable_cloud_init: true From 3311a4bf78707447ee1082f126bc979d5bdcb9c5 Mon Sep 17 00:00:00 2001 From: Douglas Holt Date: Thu, 27 Jul 2023 15:08:20 -0700 Subject: [PATCH 31/39] use latest version by default --- roles/nvidia_cuda/defaults/main.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/roles/nvidia_cuda/defaults/main.yml b/roles/nvidia_cuda/defaults/main.yml index fb8a2caa1..28a313504 100644 --- a/roles/nvidia_cuda/defaults/main.yml +++ b/roles/nvidia_cuda/defaults/main.yml @@ -1,6 +1,7 @@ --- -# 'cuda' is the generic package and will pull the latest version -cuda_version: "cuda-toolkit-11-7" +# 'cuda-toolkit' is the generic package and will pull the latest version +# if you want a specific version, use i.e. cuda-toolkit=12.2.0-1 +cuda_version: cuda-toolkit # DGX-specific vars may be used to target specific models, # because available versions for DGX may differ from the generic repo From 06b6f20c195b7f083c6bac79ce89034c575d8d02 Mon Sep 17 00:00:00 2001 From: Douglas Holt Date: Thu, 27 Jul 2023 16:01:56 -0700 Subject: [PATCH 32/39] update version --- roles/singularity_wrapper/defaults/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/roles/singularity_wrapper/defaults/main.yml b/roles/singularity_wrapper/defaults/main.yml index 5be75a6a4..08ee8d83b 100644 --- a/roles/singularity_wrapper/defaults/main.yml +++ b/roles/singularity_wrapper/defaults/main.yml @@ -1,6 +1,6 @@ --- # vars for lecorguille.singularity -singularity_version: "3.7.3" +singularity_version: "3.11.4" singularity_conf_path: "/etc/singularity/singularity.conf" bind_paths: [] From 8ff806dc278f3503c7c3bb3f74628a311ce2f0c6 Mon Sep 17 00:00:00 2001 From: Douglas Holt Date: Thu, 27 Jul 2023 16:02:30 -0700 Subject: [PATCH 33/39] move some version defaults out of config to simplify updates --- config.example/group_vars/all.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/config.example/group_vars/all.yml b/config.example/group_vars/all.yml index e86376413..e041ae430 100644 --- a/config.example/group_vars/all.yml +++ b/config.example/group_vars/all.yml @@ -124,7 +124,7 @@ sftp_chroot: false # Playbook: nvidia-cuda # Install latest version by default, # if you want a specific version, use i.e. 
cuda-toolkit=12.2.0-1
-cuda_version: cuda-toolkit
+# cuda_version: cuda-toolkit

 # DGX-specific vars may be used to target specific models,
 # because available versions for DGX may differ from the generic repo
@@ -148,9 +148,9 @@ nvidia_driver_force_install: false
 # Docker configuration
 # Playbook: docker, nvidia-docker, k8s-cluster
 #
-# For supported Docker versions, see: kubespray/roles/container-engine/docker/vars/*
+# For supported Docker versions, see: submodules/kubespray/roles/container-engine/docker/vars/*
 docker_install: yes
-docker_version: '20.10'
+# docker_version: 'latest'
 docker_dns_servers_strict: no
 docker_storage_options: -s overlay2
 #docker_options: "--bip=192.168.99.1/24"
@@ -198,7 +198,9 @@ enroot_environ_config_files_dgx:
 # Singularity configuration
 # Playbook: singularity, slurm-cluster
 # Singularity target version
-singularity_version: "3.7.3"
+# set an alternate singularity version here;
+# see roles/singularity_wrapper/defaults/main.yml for default
+# singularity_version:
 singularity_conf_path: "/etc/singularity/singularity.conf"
 bind_paths: []
 # example:

From e118b3ad321a7d5eb03b27b60598e01cf6e626f1 Mon Sep 17 00:00:00 2001
From: Douglas Holt
Date: Wed, 2 Aug 2023 14:38:00 -0700
Subject: [PATCH 34/39] set newer default driver version

---
 config.example/group_vars/all.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/config.example/group_vars/all.yml b/config.example/group_vars/all.yml
index e041ae430..ce104c3a5 100644
--- a/config.example/group_vars/all.yml
+++ b/config.example/group_vars/all.yml
@@ -121,6 +121,9 @@ sftp_chroot: false
 # NVIDIA #
 ################################################################################
 # NVIDIA GPU configuration
+# Playbook: nvidia-driver
+nvidia_driver_branch: 530
+
 # Playbook: nvidia-cuda
 # Install latest version by default,
 # if you want a specific version, use i.e. cuda-toolkit=12.2.0-1

From 59af37039d8b4eff064b32103957b52a2444cae5 Mon Sep 17 00:00:00 2001
From: Douglas Holt
Date: Wed, 2 Aug 2023 14:46:00 -0700
Subject: [PATCH 35/39] remove driver version

---
 config.example/group_vars/all.yml | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/config.example/group_vars/all.yml b/config.example/group_vars/all.yml
index ce104c3a5..e041ae430 100644
--- a/config.example/group_vars/all.yml
+++ b/config.example/group_vars/all.yml
@@ -121,9 +121,6 @@ sftp_chroot: false
 # NVIDIA #
 ################################################################################
 # NVIDIA GPU configuration
-# Playbook: nvidia-driver
-nvidia_driver_branch: 530
-
 # Playbook: nvidia-cuda
 # Install latest version by default,
 # if you want a specific version, use i.e. 
cuda-toolkit=12.2.0-1

From c2f0aa8c629bf49efb9edbff7a3e129d5369a077 Mon Sep 17 00:00:00 2001
From: Douglas Holt
Date: Thu, 3 Aug 2023 10:02:17 -0700
Subject: [PATCH 36/39] move config to roles

---
 config.example/group_vars/slurm-cluster.yml | 14 +++++++-------
 roles/nvidia_hpc_sdk/defaults/main.yml | 2 +-
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/config.example/group_vars/slurm-cluster.yml b/config.example/group_vars/slurm-cluster.yml
index 505d60ca4..688517fc8 100644
--- a/config.example/group_vars/slurm-cluster.yml
+++ b/config.example/group_vars/slurm-cluster.yml
@@ -3,7 +3,7 @@
 ################################################################################
 # Slurm job scheduler configuration
 # Playbook: slurm, slurm-cluster, slurm-perf, slurm-perf-cluster, slurm-validation
-slurm_version: "23.02.4"
+# slurm_version: ""
 slurm_install_prefix: /usr/local
 pmix_install_prefix: /opt/deepops/pmix
 hwloc_install_prefix: /opt/deepops/hwloc
@@ -137,10 +137,10 @@ sm_install_host: "slurm-master[0]"
 slurm_install_hpcsdk: true

 # Select the version of HPC SDK to download
-hpcsdk_major_version: "23"
-hpcsdk_minor_version: "5"
-hpcsdk_file_cuda: "12.1"
-hpcsdk_arch: "x86_64"
+#hpcsdk_major_version: ""
+#hpcsdk_minor_version: ""
+#hpcsdk_file_cuda: ""
+#hpcsdk_arch: "x86_64"

 # In a Slurm cluster, default to setting up HPC SDK as modules rather than in
 # the default user environment
@@ -156,7 +156,7 @@ hpcsdk_install_in_path: false
 # this can help you get started.
 ################################################################################
 slurm_cluster_install_openmpi: false
-openmpi_version: 4.0.4
+#openmpi_version:
 openmpi_install_prefix: "/usr/local"
 openmpi_configure: "./configure --prefix={{ openmpi_install_prefix }} --disable-dependency-tracking --disable-getpwuid --with-pmix={{ pmix_install_prefix }} --with-hwloc={{ hwloc_install_prefix }} --with-pmi={{ slurm_install_prefix }} --with-slurm={{ slurm_install_prefix }} --with-libevent=/usr"
@@ -185,7 +185,7 @@ allow_user_set_gpu_clocks: no
 ################################################################################
 slurm_install_enroot: true
 slurm_install_pyxis: true
-slurm_pyxis_version: 0.11.1
+#slurm_pyxis_version:
 # /run is default partition of pyxis runtime_path
 resize_run_partition: false

diff --git a/roles/nvidia_hpc_sdk/defaults/main.yml b/roles/nvidia_hpc_sdk/defaults/main.yml
index ba99f3575..cceb25f41 100644
--- a/roles/nvidia_hpc_sdk/defaults/main.yml
+++ b/roles/nvidia_hpc_sdk/defaults/main.yml
@@ -16,7 +16,7 @@
 # Version strings used to construct download URL
 hpcsdk_major_version: "23"
-hpcsdk_minor_version: "5"
+hpcsdk_minor_version: "7"
 hpcsdk_file_cuda: "12.1"
 hpcsdk_arch: "x86_64"

From b6bdffb3cb286573e91499bee99851ca48c41a0c Mon Sep 17 00:00:00 2001
From: Douglas Holt
Date: Thu, 3 Aug 2023 11:13:58 -0700
Subject: [PATCH 37/39] update HPC SDK CUDA version

---
 roles/nvidia_hpc_sdk/defaults/main.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/roles/nvidia_hpc_sdk/defaults/main.yml b/roles/nvidia_hpc_sdk/defaults/main.yml
index cceb25f41..4c4a0eed2 100644
--- a/roles/nvidia_hpc_sdk/defaults/main.yml
+++ b/roles/nvidia_hpc_sdk/defaults/main.yml
@@ -17,13 +17,13 @@
 # Version strings used to construct download URL
 hpcsdk_major_version: "23"
 hpcsdk_minor_version: "7"
-hpcsdk_file_cuda: "12.1"
+hpcsdk_file_cuda: "12.2"
 hpcsdk_arch: "x86_64"

 # We need to specify the default CUDA toolkit to use during installation.
 # This should usually be the latest CUDA included in the HPC SDK you are
 # installing. 
-hpcsdk_default_cuda: "12.1"
+hpcsdk_default_cuda: "12.2"

 # Add HPC SDK modules to the MODULEPATH?
 hpcsdk_install_as_modules: false

From 5d98c7869642aa522dd45314296da44c8b065e5a Mon Sep 17 00:00:00 2001
From: Douglas Holt
Date: Mon, 7 Aug 2023 13:53:24 -0700
Subject: [PATCH 38/39] update release tag

---
 README.md | 26 +++----------------------
 1 file changed, 3 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index 094224950..6a0c3ea7d 100644
--- a/README.md
+++ b/README.md
@@ -29,27 +29,7 @@ The DeepOps project encapsulates best practices in the deployment of GPU server
 - An existing cluster that needs a resource manager / batch scheduler, where DeepOps is used to install Slurm or Kubernetes
 - A single machine where no scheduler is desired, only NVIDIA drivers, Docker, and the NVIDIA Container Runtime

-## Releases Notes
-
-Latest release: [DeepOps 22.08 Release](https://github.com/NVIDIA/deepops/releases/tag/22.08)
-
-- Kubernetes Default Components:
-
-  - [kubernetes](https://github.com/kubernetes/kubernetes) v1.22.8
-  - [etcd](https://github.com/coreos/etcd) v3.5.0
-  - [docker](https://www.docker.com/) v20.10
-  - [containerd](https://containerd.io/) v1.5.8
-  - [cri-o](http://cri-o.io/) v1.22
-  - [calico](https://github.com/projectcalico/calico) v3.20.3
-  - [dashboard](https://github.com/kubernetes/dashboard/tree/master) v2.0.3
-  - [dashboard metrics scraper](https://github.com/kubernetes-sigs/dashboard-metrics-scraper/tree/master) v1.0.4
-  - [nvidia gpu operator](https://github.com/NVIDIA/gpu-operator/tree/master) 1.10.0
-
-- Slurm Default Components:
-
-  - [slurm](https://github.com/SchedMD/slurm/tree/master) 21.08.8-2
-  - [Singularity](https://github.com/apptainer/singularity/tree/master) 3.7.3
-  - [docker](https://www.docker.com/) v20.10
+Latest release: [DeepOps 23.08 Release](https://github.com/NVIDIA/deepops/releases/tag/23.08)

 It is recommended to use the latest release branch for stable code (linked above). All development takes place on the master branch, which is generally [functional](docs/deepops/testing.md) but may change significantly between releases.

@@ -60,7 +40,7 @@ It is recommended to use the latest release branch for stable code (linked above
 The provisioning system is used to orchestrate the running of all playbooks and one will be needed when instantiating Kubernetes or Slurm clusters. Operating systems which are tested and supported include:

 - NVIDIA DGX OS 4, 5
-- Ubuntu 18.04 LTS, 20.04 LTS
+- Ubuntu 18.04 LTS, 20.04 LTS, 22.04 LTS
 - CentOS 7, 8

 ### Cluster System

 The cluster nodes will follow the requirements described by Slurm or Kubernetes. You may also use a cluster node as a provisioning system but it is not required.

 - NVIDIA DGX OS 4, 5
-- Ubuntu 18.04 LTS, 20.04 LTS
+- Ubuntu 18.04 LTS, 20.04 LTS, 22.04 LTS
 - CentOS 7, 8

 You may also install a supported operating system on all servers via a 3rd-party solution (e.g. [MAAS](https://maas.io/), [Foreman](https://www.theforeman.org/)) or utilize the provided [OS install container](docs/pxe/minimal-pxe-container.md). 
From 22ef6c0bb54a9c72f1d8d3abd2480635b602d2e3 Mon Sep 17 00:00:00 2001 From: Douglas Holt Date: Mon, 7 Aug 2023 13:54:22 -0700 Subject: [PATCH 39/39] remove release notes section --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 6a0c3ea7d..4476ffc6e 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,6 @@ Infrastructure automation tools for Kubernetes and Slurm clusters with NVIDIA GP - [DeepOps](#deepops) - [Table of Contents](#table-of-contents) - [Overview](#overview) - - [Releases Notes](#releases-notes) - [Deployment Requirements](#deployment-requirements) - [Provisioning System](#provisioning-system) - [Cluster System](#cluster-system)