Skip to content

Commit

Permalink
Merge pull request #425 from chengcongdu/develop
Browse files Browse the repository at this point in the history
add node READY state check for TAS
  • Loading branch information
chengcongdu authored Dec 20, 2024
2 parents 3140dbc + 9944d76 commit 04688ae
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 1 deletion.
10 changes: 10 additions & 0 deletions gke-topology-scheduler/schedule-daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,16 @@ def find_schedulable_nodes(
)
continue

# skip nodes that are not in Ready state
if any(
condition.type == "Ready" and condition.status != "True" for condition in node.status.conditions
):
logging.info(
'Skipping node %s because it is NotReady',
node_name
)
continue

allocatable = node.status.allocatable
used_cpu, used_memory, used_gpu = 0, 0, 0

Expand Down
11 changes: 10 additions & 1 deletion gpudirect-tcpxo/topology-scheduler/schedule-daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ def find_schedulable_nodes(nodes, pods, tolerated_taints):
continue

skip_node = False
# check node taints
if node.spec.taints is not None:
for t in node.spec.taints:
if t.key not in tolerated_taint_dict:
Expand All @@ -154,8 +155,14 @@ def find_schedulable_nodes(nodes, pods, tolerated_taints):
else:
tol = tolerated_taint_dict[t.key]
if tol.operator == "Equal" and tol.value != t.value:
print(f'Skipping node {node_name} because it is tainted with key {t.key} with value {t.value}')
skip_node = True
break
# check node status
if any(condition.type == "Ready" and condition.status != "True" for condition in node.status.conditions):
print(f'Skipping node {node_name} because it is NotReady')
skip_node = True
break

if skip_node:
continue
Expand Down Expand Up @@ -321,7 +328,9 @@ def schedule_pod_on_node(v1, pod_name, pod_namespace, node, gate_name):

v1.replace_namespaced_pod(pod_name, pod_namespace, pod)

print(f'Pod {pod_namespace}/{pod_name} scheduled on {node['name']} with topology: {node_topology_key(node)}')
print(
'Pod %s/%s scheduled on %s with topology %s', pod_namespace, pod_name, node['name'], node_topology_key(node)
)
except ApiException as e:
print(f'Exception when removing scheduling gate: {e}')

Expand Down

0 comments on commit 04688ae

Please sign in to comment.