A3U TAS support (#416)

* A3U TAS support * merge 2 versions support * sign * a3m & a3u generic scheduler * boilerplate fix * readme
GoogleCloudPlatform · Nov 19, 2024 · 723ec14 · 723ec14
1 parent f2cab24
commit 723ec14
Show file tree

Hide file tree

Showing 6 changed files with 1,035 additions and 0 deletions.
diff --git a/gke-topology-scheduler/README.md b/gke-topology-scheduler/README.md
@@ -0,0 +1,53 @@
+## Overview
+
+This document gives instructions on how to enable topology-aware scheduling in
+GKE clusters running on A3M & A3U machines.
+
+The general outline for this to be successful is:
+- We add labels for topology to nodes in the cluster with a daemonset
+- We handle pod scheduling with a scheduling daemon
+- Pods with the added schedulingGate are picked up and scheduled
+
+## Prerequisites
+
+For topology awareness to be enabled in A3M, a GKE node pool has to be created with
+compact placement. Specifically, the `physical_host` attribute
+[ref](https://cloud.google.com/compute/docs/instances/use-compact-placement-policies#verify-vm-location)
+should be present for each GPU node in the cluster.
+
+## Configuration
+
+To initialize Kubernetes authentication for scripts:
+
+```gcloud container clusters get-credentials [cluster name] --zone [cluster zone] --project [project id]```
+
+## Usage
+
+First copy this folder locally
+
+Next create config maps for scripts required by pods
+
+-   Run `kubectl create configmap topology-scheduler-scripts --namespace
+    kube-system --from-file=schedule-daemon.py=schedule-daemon.py
+    --from-file=label-nodes-daemon.py=label-nodes-daemon.py`
+
+Next apply the service account config to the cluster:
+
+-   Apply `service-account.yaml` config to the cluster by running `kubectl apply
+    -f service-account.yaml`.
+
+Now apply the scheduling and label daemons to the cluster so that pods with
+the correct schedulingGates will be scheduled automatically:
+
+-   Apply `schedule-daemon.yaml` daemonset to the cluster by running `kubectl
+    apply -f schedule-daemon.yaml`.
+-   If the GKE version is lower than 1.31, apply the `label-nodes-daemon.yaml`
+    daemonset to the cluster by running `kubectl apply -f label-nodes-daemon.yaml`.
+
+To let the daemon "pick up" the workload for scheduling, simply add a
+schedulingGate whose name starts with `gke.io/topology-aware-auto-`, for example:
+
+```
+  schedulingGates:
+  - name: "gke.io/topology-aware-auto-my-job-name"
+```
diff --git a/gke-topology-scheduler/label-nodes-daemon.py b/gke-topology-scheduler/label-nodes-daemon.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+
+# Copyright 2024 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Daemon to update Kubernetes node labels based on GCE VM metadata."""
+
+import time
+from typing import Dict
+
+from kubernetes import client
+from kubernetes import config
+import requests
+
+
+def update_node_labels(kube: client.CoreV1Api) -> None:
+  """Updates Kubernetes node labels based on GCE VM metadata."""
+  node_name_url = "http://metadata.google.internal/computeMetadata/v1/instance/name"
+  metadata_url = "http://metadata.google.internal/computeMetadata/v1/instance/attributes/physical_host"
+  headers = {"Metadata-Flavor": "Google"}
+
+  response = requests.get(node_name_url, headers=headers)
+
+  if response.status_code == 200:
+    node_name = response.text
+  else:
+    print("Node name not found")
+    return
+
+  response = requests.get(metadata_url, headers=headers)
+
+  if response.status_code == 200:
+    physical_host = response.text
+  else:
+    print("physical host not found")
+    return
+
+  cluster, rack, host = physical_host.split("/")[1:]
+
+  node_labels: Dict[str, str] = {
+      "topology.gke.io/cluster": cluster,
+      "topology.gke.io/rack": rack,
+      "topology.gke.io/host": host,
+  }
+
+  kube.patch_node(node_name, {"metadata": {"labels": node_labels}})  # type: ignore
+  print(f"Updated labels on node {node_name}: {node_labels}")
+
+
+if __name__ == "__main__":
+  # Kubernetes configuration
+  config.load_incluster_config()
+  client = client.CoreV1Api()
+
+  while True:
+    print("Starting node update")
+    # Update node labels
+    update_node_labels(client)
+    time.sleep(600)
diff --git a/gke-topology-scheduler/label-nodes-daemon.yaml b/gke-topology-scheduler/label-nodes-daemon.yaml
@@ -0,0 +1,35 @@
+apiVersion: apps/v1
+kind: DaemonSet  # runs one label-nodes-daemon pod per eligible node
+metadata:
+  name: label-nodes-daemon
+  namespace: kube-system
+spec:
+  selector:
+    matchLabels:
+      name: label-nodes-daemon  # must match the pod template label below
+  template:
+    metadata:
+      labels:
+        name: label-nodes-daemon
+    spec:
+      tolerations:
+      - operator: "Exists"  # tolerate any taint with the key below, whatever its value
+        key: nvidia.com/gpu
+      hostNetwork: true  # pod shares the node's network namespace; presumably needed to reach the VM metadata server - TODO confirm
+      containers:
+      - name: label-nodes-daemon
+        image: python:3.10
+        command:
+        - bash
+        - -c
+        - |
+          pip install kubernetes
+          python -u /scripts/label-nodes-daemon.py
+        volumeMounts:
+        - name: scripts-volume
+          mountPath: /scripts  # the command above runs the script from this directory
+      volumes:
+      - name: scripts-volume
+        configMap:
+          name: topology-scheduler-scripts  # ConfigMap expected to contain label-nodes-daemon.py
+      serviceAccount: topology-scheduler  # NOTE(review): `serviceAccount` is the deprecated alias of `serviceAccountName` - consider switching